diff --git a/ant-bridge.sh b/ant-bridge.sh index a2f686586..af94fa435 100755 --- a/ant-bridge.sh +++ b/ant-bridge.sh @@ -1,6 +1,7 @@ #!/bin/sh -mvn_args="verify" +default_args="verify '-Ddisable.shadepackage'" +mvn_args="${default_args}" mvn_properties= mvn_clean= unknown_args= @@ -44,22 +45,23 @@ for arg in "${@}" ; do fi else - if [[ "${arg}" != "dist" && "${mvn_args}" != "" && "${mvn_args}" != "verify" ]] ; then + if [[ "${arg}" != "dist" && "${mvn_args}" != "" && "${mvn_args}" != "${default_args}" ]] ; then echo "Sorry, this script does not currently support mixing targets." >&2 exit 1 elif [[ "${arg}" == "dist" ]] ; then - mvn_args="verify" + mvn_args="${default_args}" elif [[ "${arg}" == "gatk" ]] ; then - mvn_args="verify '-P!queue'" + mvn_args="${default_args} '-P!queue'" elif [[ "${arg}" == "test.compile" ]] ; then mvn_args="test-compile" elif [[ "${arg}" == "gatkdocs" ]] ; then local_repo="sitetemprepo" - mvn_args="install -Dmaven.repo.local=${local_repo} -Ddisable.queue && mvn site -Dmaven.repo.local=${local_repo} -Ddisable.queue" + mvn_args="install -Dmaven.repo.local=${local_repo} '-P!queue' && mvn site -Dmaven.repo.local=${local_repo} '-P!queue'" + mvn_pkg_args= elif [[ "${arg}" == "package.gatk.full" ]] ; then mvn_args="package '-P!private,!queue'" @@ -75,11 +77,11 @@ for arg in "${@}" ; do # elif [[ "${arg}" == "release.gatk.full" ]] ; then # mvn_args="package '-P!private,!queue'" -# post_script=" && private/src/main/scripts/shell/copy_release.sh public/gatk-package/target/GenomeAnalysisTK-*.tar.bz2" +# post_script=" && private/src/main/scripts/shell/copy_release.sh protected/gatk-package-distribution/target/GenomeAnalysisTK-*.tar.bz2" # elif [[ "${arg}" == "release.queue.full" ]] ; then # mvn_args="package '-P!private'" -# post_script=" && private/src/main/scripts/shell/copy_release.sh public/queue-package/target/Queue-*.tar.bz2" +# post_script=" && private/src/main/scripts/shell/copy_release.sh protected/gatk-queue-package-distribution/target/Queue-*.tar.bz2" elif [[ "${arg}" == "build-picard-private" ]] ; then mvn_args="mvn install -f private/picard-maven/pom.xml" @@ -113,7 +115,7 @@ for arg in "${@}" ; do mvn_args="${mvn_args} -Dgatk.queuetests.run=true" elif [[ "${arg}" == "committests" ]] ; then - mvn_args="verify -Dgatk.committests.skipped=false" + mvn_args="${default_args} -Dgatk.committests.skipped=false" elif [[ "${arg}" == "test" ]] ; then mvn_args="test -Dgatk.unittests.skipped=false" @@ -122,19 +124,19 @@ for arg in "${@}" ; do mvn_args="test -Dgatk.unittests.skipped=false" elif [[ "${arg}" == "integrationtest" ]] ; then - mvn_args="verify -Dgatk.integrationtests.skipped=false" + mvn_args="${default_args} -Dgatk.integrationtests.skipped=false" elif [[ "${arg}" == "largescaletest" ]] ; then - mvn_args="verify -Dgatk.largescaletests.skipped=false" + mvn_args="${default_args} -Dgatk.largescaletests.skipped=false" elif [[ "${arg}" == "knowledgebasetest" ]] ; then - mvn_args="verify -Dgatk.knowledgebasetests.skipped=false" + mvn_args="${default_args} -Dgatk.knowledgebasetests.skipped=false" elif [[ "${arg}" == "queuetest" ]] ; then - mvn_args="verify -Dgatk.queuetests.skipped=false" + mvn_args="${default_args} -Dgatk.queuetests.skipped=false" elif [[ "${arg}" == "queuetestrun" ]] ; then - mvn_args="verify -Dgatk.queuetests.skipped=false -Dgatk.queuetests.run=true" + mvn_args="${default_args} -Dgatk.queuetests.skipped=false -Dgatk.queuetests.run=true" elif [[ "${arg}" == "fasttest" ]] ; then mvn_args="verify -Dgatk.committests.skipped=false -pl private/gatk-tools-private -am -Dresource.bundle.skip=true" diff --git a/pom.xml b/pom.xml index 8488cf87f..95440ec8a 100644 --- a/pom.xml +++ b/pom.xml @@ -13,7 +13,7 @@ org.broadinstitute.gatk gatk-root - 3.3 + 3.4-SNAPSHOT public/gatk-root @@ -32,11 +32,15 @@ false -build-timestamp "${maven.build.timestamp}" + ${gatk.basedir}/public/src/main/scripts/shell + ${gatk.basedir}/public/src/main/assembly package + prepare-package + package generate-resources process-resources process-test-resources @@ -65,6 +69,16 @@ ${gatk.serialcommittests.skipped} true true + + + ${gatk.basedir}/target/executable + + ${gatk.basedir}/target/package + + ${gatk.basedir}/target @@ -138,9 +152,19 @@ ${resource.bundle.path} + + executable-jar-lib + + copy-dependencies + + none + + ${gatk.executable.directory}/lib + runtime + + - org.apache.maven.plugins maven-resources-plugin @@ -159,25 +183,6 @@ ${gatk.process-test-resources.phase} - - copy-resource-bundle-log4j - - copy-resources - - none - - - ${project.reporting.outputDirectory}/apidocs - - - ${gatk.basedir}/gatk-utils/src/main/config/org/broadinstitute/gatk/utils/help - - - - @@ -198,8 +203,7 @@ ${project.build.outputDirectory} ${project.groupId} - - gatk-tools-public + ${project.artifactId} ${project.version} 2g @@ -319,10 +323,40 @@ org.apache.maven.plugins maven-jar-plugin + + executable-jar + + jar + + none + + ${project.build.outputDirectory}/ignored_by_executable_jar + ${gatk.executable.directory} + ${gatk.binary-dist.name} + + + ${app.main.class} + true + lib/ + + + + default-jar ${gatk.jar.phase} + + + unshaded-default-jar + + jar + + none + test-jar @@ -341,13 +375,14 @@ maven-shade-plugin - gatk-executable + package-jar shade none true + false org.broadinstitute.gatk:gsalib:tar.gz:* @@ -405,7 +440,7 @@ none - src/main/assembly/binary-dist.xml + ${gatk.assembly.directory}/binary-dist.xml @@ -437,7 +472,7 @@ - link-binary-jar + link-executable-jar link @@ -445,7 +480,26 @@ - ${gatk.basedir}/target/${gatk.binary-dist.name}.${project.packaging} + ${gatk.shortcut.directory}/${gatk.binary-dist.name}.${project.packaging} + ${gatk.executable.directory}/${gatk.binary-dist.name}.${project.packaging} + + + + + + link-package-jar + + link + + none + + + + ${gatk.package.directory}/${gatk.binary-dist.name}.${project.packaging} + ${project.build.directory}/${project.build.finalName}.${project.packaging} + + + ${gatk.shortcut.directory}/${gatk.binary-dist.name}.${project.packaging} ${project.build.directory}/${project.build.finalName}.${project.packaging} @@ -624,6 +678,21 @@ org.codehaus.mojo exec-maven-plugin + + + check-utils-engine-tools + + exec + + process-sources + false + + ${gatk.shell.directory}/check_utils_engine_tools.sh + + false - org.broadinstitute.gatk.utils.help.GATKDoclet + org.broadinstitute.gatk.tools.walkers.help.WalkerDoclet ${project.groupId} gatk-package-distribution @@ -733,6 +802,26 @@ + + + + fast + + + disable.shadepackage + + + + none + none + + + packagetests-enabled @@ -746,6 +835,8 @@ true true none + none + none none none none diff --git a/protected/gatk-package-distribution/pom.xml b/protected/gatk-package-distribution/pom.xml index f8f530afa..c48ffad7b 100644 --- a/protected/gatk-package-distribution/pom.xml +++ b/protected/gatk-package-distribution/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.3 + 3.4-SNAPSHOT ../.. @@ -15,8 +15,6 @@ ${project.basedir}/../.. - prepare-package - package org.broadinstitute.gatk.engine.CommandLineGATK GenomeAnalysisTK @@ -43,9 +41,14 @@ gatk-tools-protected ${project.version} + + + org.slf4j + slf4j-log4j12 + - samtools + com.github.samtools htsjdk @@ -73,7 +76,7 @@ ${project.groupId} - gatk-engine + gatk-utils ${project.version} example-resources tar.bz2 @@ -164,6 +167,25 @@ + + org.apache.maven.plugins + maven-jar-plugin + + + executable-jar + ${gatk.jar.phase} + + + default-jar + none + + + unshaded-default-jar + ${gatk.jar.phase} + + + + org.apache.maven.plugins maven-dependency-plugin @@ -172,6 +194,10 @@ unpack-direct-dependencies ${gatk.unpack.phase} + + executable-jar-lib + ${gatk.jar.phase} + @@ -180,7 +206,7 @@ maven-shade-plugin - gatk-executable + package-jar ${gatk.shade.phase} @@ -202,7 +228,11 @@ maven-junction-plugin - link-binary-jar + link-executable-jar + ${gatk.jar.phase} + + + link-package-jar ${gatk.shade.phase} @@ -231,20 +261,6 @@ - - packagetests-enabled - - - gatk.packagetests.enabled - true - - - - none - none - none - - gsadev diff --git a/protected/gatk-package-distribution/src/main/assembly/binary-dist.xml b/protected/gatk-package-distribution/src/main/assembly/binary-dist.xml deleted file mode 100644 index 11fb98e00..000000000 --- a/protected/gatk-package-distribution/src/main/assembly/binary-dist.xml +++ /dev/null @@ -1,22 +0,0 @@ - - binary-dist - - tar.bz2 - - false - - - - org.broadinstitute.gatk:gatk-package-distribution - - ${gatk.binary-dist.name}.${artifact.extension} - - - resources - true - - org.broadinstitute.gatk:gatk-engine:tar.bz2:example-resources - - - - diff --git a/protected/gatk-queue-extensions-distribution/pom.xml b/protected/gatk-queue-extensions-distribution/pom.xml index 9f82d9edc..6b8e52749 100644 --- a/protected/gatk-queue-extensions-distribution/pom.xml +++ b/protected/gatk-queue-extensions-distribution/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.3 + 3.4-SNAPSHOT ../.. @@ -41,6 +41,10 @@ log4j log4j + + com.github.broadinstitute + picard + ${project.groupId} - gatk-tools-public + gatk-utils ${project.version} test-jar test diff --git a/protected/gatk-queue-extensions-distribution/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/examples/ExampleFindCoveredIntervals.scala b/protected/gatk-queue-extensions-distribution/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/examples/ExampleFindCoveredIntervals.scala new file mode 100644 index 000000000..48a393cb6 --- /dev/null +++ b/protected/gatk-queue-extensions-distribution/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/examples/ExampleFindCoveredIntervals.scala @@ -0,0 +1,76 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.queue.qscripts.examples + +import org.broadinstitute.gatk.queue.QScript +import org.broadinstitute.gatk.queue.extensions.gatk._ + +class ExampleFindCoveredIntervals extends QScript { + @Input(doc="The reference file for the bam files.", shortName="R") + var referenceFile: File = _ + + @Input(doc="Bam file to genotype.", shortName="I") + var bamFile: File = _ + + @Output(doc="Bam output", shortName="out") + var outFile: File = _ + + def script() { + val fci = new FindCoveredIntervals + fci.R = referenceFile + fci.memoryLimit = 2 + fci.scatterCount = 3 + fci.I :+= bamFile + fci.out = outFile + add(fci) + } +} diff --git a/protected/gatk-queue-extensions-distribution/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/examples/ExampleHaplotypeCaller.scala b/protected/gatk-queue-extensions-distribution/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/examples/ExampleHaplotypeCaller.scala new file mode 100644 index 000000000..de7cdd4c6 --- /dev/null +++ b/protected/gatk-queue-extensions-distribution/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/examples/ExampleHaplotypeCaller.scala @@ -0,0 +1,80 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.queue.qscripts.examples + +import org.broadinstitute.gatk.queue.QScript +import org.broadinstitute.gatk.queue.extensions.gatk._ + +class ExampleHaplotypeCaller extends QScript { + @Input(doc="The reference file for the bam files.", shortName="R") + var referenceFile: File = _ + + @Input(doc="Bam file to genotype.", shortName="I") + var bamFile: File = _ + + @Output(doc="VCF output", shortName="out") + var outFile: File = _ + + @Argument(doc="One or more genomic intervals over which to operate", shortName="L", required=false) + var intervals: Seq[String] = Nil + + def script() { + val hc = new HaplotypeCaller + hc.R = referenceFile + hc.memoryLimit = 2 + hc.scatterCount = 3 + hc.I :+= bamFile + hc.out = outFile + hc.intervalsString = intervals + add(hc) + } +} diff --git a/protected/gatk-queue-extensions-distribution/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/ExampleUnifiedGenotyperQueueTest.scala b/protected/gatk-queue-extensions-distribution/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/ExampleUnifiedGenotyperQueueTest.scala index c4e81047f..d49bee6a5 100644 --- a/protected/gatk-queue-extensions-distribution/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/ExampleUnifiedGenotyperQueueTest.scala +++ b/protected/gatk-queue-extensions-distribution/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/ExampleUnifiedGenotyperQueueTest.scala @@ -91,7 +91,7 @@ class ExampleUnifiedGenotyperQueueTest { " -R " + BaseTest.hg18Reference, " -I " + BaseTest.validationDataLocation + "OV-0930.normal.chunk.bam", " -L " + intervalsPath).mkString - spec.jobRunners = Seq("Lsf706") + spec.jobRunners = Seq("GridEngine") QueueTest.executeTest(spec) } diff --git a/protected/gatk-queue-extensions-distribution/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/UnmappedExcludedQueueTest.scala b/protected/gatk-queue-extensions-distribution/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/UnmappedExcludedQueueTest.scala new file mode 100644 index 000000000..997f0e9ef --- /dev/null +++ b/protected/gatk-queue-extensions-distribution/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/UnmappedExcludedQueueTest.scala @@ -0,0 +1,93 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.queue.pipeline.examples + +import org.broadinstitute.gatk.queue.pipeline.{QueueTest, QueueTestSpec} +import org.broadinstitute.gatk.utils.BaseTest +import org.testng.annotations.Test + +class UnmappedExcludedQueueTest { + + @Test(timeOut=36000000) + def testUnmappedExclusion(): Unit = { + + //FindCoveredIntervals is an ActiveRegionWalker, which throws an exception if it encounters unmapped reads + //But it's partitioned by contigs, which by default includes unmapped reads. Verify that the unmapped reads + //are correctly not added in this case + val testOut = "fci.out" + val spec = new QueueTestSpec + spec.name = "findcoveredintervals" + spec.args = Array( + " -S " + QueueTest.protectedQScriptsPackageDir + "examples/ExampleFindCoveredIntervals.scala", + " -R " + BaseTest.publicTestDir + "exampleFASTA.fasta", + " -I " + BaseTest.publicTestDir + "exampleBAM.bam", + " -out " + testOut).mkString + + //The output file is blank - the real test is simply that it runs to completion + spec.fileMD5s += testOut -> "d41d8cd98f00b204e9800998ecf8427e" + QueueTest.executeTest(spec) + + //Regression Test: HaplotypeCaller is also an ActiveRegionWalker, and is much more widely used. Explicitly test + //it as well + val hcTestOut = "hctest.vcf" + val hcSpec = new QueueTestSpec + hcSpec.name = "haplotypecaller" + hcSpec.args = Array( + " -S " + QueueTest.protectedQScriptsPackageDir + "examples/ExampleHaplotypeCaller.scala", + " -R " + BaseTest.publicTestDir + "exampleFASTA.fasta", + " -I " + BaseTest.publicTestDir + "exampleBAM.bam", + " -out " + hcTestOut).mkString + + //The output file is blank - the real test is simply that it runs to completion + QueueTest.executeTest(hcSpec) + } +} diff --git a/protected/gatk-queue-package-distribution/pom.xml b/protected/gatk-queue-package-distribution/pom.xml index 20de01afb..cb653c91b 100644 --- a/protected/gatk-queue-package-distribution/pom.xml +++ b/protected/gatk-queue-package-distribution/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.3 + 3.4-SNAPSHOT ../.. @@ -15,8 +15,6 @@ ${project.basedir}/../.. - prepare-package - package Queue org.broadinstitute.gatk.queue.QCommandLine @@ -49,7 +47,7 @@ - picard + com.github.broadinstitute picard @@ -80,7 +78,7 @@ ${project.groupId} - gatk-engine + gatk-utils ${project.version} example-resources tar.bz2 @@ -171,6 +169,25 @@ + + org.apache.maven.plugins + maven-jar-plugin + + + executable-jar + ${gatk.jar.phase} + + + default-jar + none + + + unshaded-default-jar + ${gatk.jar.phase} + + + + org.apache.maven.plugins maven-dependency-plugin @@ -179,6 +196,10 @@ unpack-direct-dependencies ${gatk.unpack.phase} + + executable-jar-lib + ${gatk.jar.phase} + @@ -187,7 +208,7 @@ maven-shade-plugin - gatk-executable + package-jar ${gatk.shade.phase} @@ -209,7 +230,11 @@ maven-junction-plugin - link-binary-jar + link-executable-jar + ${gatk.jar.phase} + + + link-package-jar ${gatk.shade.phase} @@ -238,20 +263,6 @@ - - packagetests-enabled - - - gatk.packagetests.enabled - true - - - - none - none - none - - gsadev diff --git a/protected/gatk-queue-package-distribution/src/main/assembly/binary-dist.xml b/protected/gatk-queue-package-distribution/src/main/assembly/binary-dist.xml deleted file mode 100644 index daa974216..000000000 --- a/protected/gatk-queue-package-distribution/src/main/assembly/binary-dist.xml +++ /dev/null @@ -1,23 +0,0 @@ - - binary-dist - - tar.bz2 - - false - - - - org.broadinstitute.gatk:gatk-queue-package-distribution - - ${gatk.binary-dist.name}.${artifact.extension} - - - resources - true - - org.broadinstitute.gatk:gatk-engine:tar.bz2:example-resources - org.broadinstitute.gatk:gatk-queue-extensions-public:tar.bz2:example-resources - - - - diff --git a/protected/gatk-tools-protected/pom.xml b/protected/gatk-tools-protected/pom.xml index 6f026c827..3df22c1a1 100644 --- a/protected/gatk-tools-protected/pom.xml +++ b/protected/gatk-tools-protected/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.3 + 3.4-SNAPSHOT ../.. @@ -48,7 +48,15 @@ ${project.groupId} - gatk-tools-public + gatk-utils + ${project.version} + test-jar + test + + + + ${project.groupId} + gatk-engine ${project.version} test-jar test @@ -63,16 +71,6 @@ - - org.apache.maven.plugins - maven-resources-plugin - - - copy-resource-bundle-log4j - prepare-package - - - org.apache.maven.plugins maven-javadoc-plugin diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java deleted file mode 100644 index bf52849d6..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java +++ /dev/null @@ -1,232 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.engine.arguments; - -import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorImplementation; -import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingOutputMode; -import org.broadinstitute.gatk.tools.walkers.genotyper.OutputMode; -import org.broadinstitute.gatk.utils.collections.DefaultHashMap; -import htsjdk.variant.variantcontext.VariantContext; - -import java.io.File; -import java.lang.reflect.Field; -import java.lang.reflect.Method; -import java.lang.reflect.Modifier; -import java.util.Collections; -import java.util.Map; - -/** - * Created with IntelliJ IDEA. - * User: rpoplin - * Date: 8/20/12 - * A collection of arguments that are common to the various callers. - * This is pulled out so that every caller isn't exposed to the arguments from every other caller. - */ - -public class StandardCallerArgumentCollection implements Cloneable { - - @ArgumentCollection - public GenotypeCalculationArgumentCollection genotypeArgs = new GenotypeCalculationArgumentCollection(); - - @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) - public GenotypingOutputMode genotypingOutputMode = GenotypingOutputMode.DISCOVERY; - - /** - * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding - */ - @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false) - public RodBinding alleles; - - /** - * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. - * Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we - * will try to remove (N * contamination fraction) bases for each alternate allele. - */ - @Argument(fullName = "contamination_fraction_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false) - public double CONTAMINATION_FRACTION = DEFAULT_CONTAMINATION_FRACTION; - public static final double DEFAULT_CONTAMINATION_FRACTION = 0.0; - - /** - * This argument specifies a file with two columns "sample" and "contamination" specifying the contamination level for those samples. - * Samples that do not appear in this file will be processed with CONTAMINATION_FRACTION. - **/ - @Advanced - @Argument(fullName = "contamination_fraction_per_sample_file", shortName = "contaminationFile", doc = "Tab-separated File containing fraction of contamination in sequencing data (per sample) to aggressively remove. Format should be \"\" (Contamination is double) per line; No header.", required = false) - public File CONTAMINATION_FRACTION_FILE = null; - - /** - * Indicates whether there is some sample contamination present. - */ - private boolean sampleContaminationWasLoaded = false; - - /** - * - * @return an _Immutable_ copy of the Sample-Contamination Map, defaulting to CONTAMINATION_FRACTION so that if the sample isn't in the map map(sample)==CONTAMINATION_FRACTION - */ - public Map getSampleContamination(){ - //make sure that the default value is set up right - sampleContamination.setDefaultValue(CONTAMINATION_FRACTION); - if (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0) - sampleContaminationWasLoaded = true; - return Collections.unmodifiableMap(sampleContamination); - } - - public void setSampleContamination(DefaultHashMap sampleContamination) { - this.sampleContamination.clear(); - this.sampleContaminationWasLoaded = !Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0; - if (!sampleContaminationWasLoaded) - for (final Double d : sampleContamination.values()) - if (!Double.isNaN(d) && d > 0.0) { - sampleContaminationWasLoaded = true; - break; - } - this.sampleContamination.putAll(sampleContamination); - this.sampleContamination.setDefaultValue(CONTAMINATION_FRACTION); - } - - /** - * Returns true if there is some sample contamination present, false otherwise. - * @return {@code true} iff there is some sample contamination - */ - public boolean isSampleContaminationPresent() { - return (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0) || sampleContaminationWasLoaded; - } - - //Needs to be here because it uses CONTAMINATION_FRACTION - private DefaultHashMap sampleContamination = new DefaultHashMap(CONTAMINATION_FRACTION); - - /** - * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. - */ - @Hidden - @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) - public AFCalculatorImplementation requestedAlleleFrequencyCalculationModel; - - @Hidden - @Argument(shortName = "logExactCalls", doc="x", required=false) - public File exactCallsLog = null; - - @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false) - public OutputMode outputMode = OutputMode.EMIT_VARIANTS_ONLY; - - /** - * Advanced, experimental argument: if SNP likelihood model is specified, and if EMIT_ALL_SITES output mode is set, when we set this argument then we will also emit PLs at all sites. - * This will give a measure of reference confidence and a measure of which alt alleles are more plausible (if any). - * WARNINGS: - * - This feature will inflate VCF file size considerably. - * - All SNP ALT alleles will be emitted with corresponding 10 PL values. - * - An error will be emitted if EMIT_ALL_SITES is not set, or if anything other than diploid SNP model is used - */ - @Advanced - @Argument(fullName = "allSitePLs", shortName = "allSitePLs", doc = "Annotate all sites with PLs", required = false) - public boolean annotateAllSitesWithPLs = false; - - /** - * Creates a Standard caller argument collection with default values. - */ - public StandardCallerArgumentCollection() { } - - /** - * "Casts" a caller argument collection into another type. - * - *

Common fields values are copied across

- * @param clazz the class of the result. - * @param result argument collection class. - * @return never {@code null}. - */ - public T cloneTo(final Class clazz) { - // short cut: just use regular clone if it happens to be the same class. - if (clazz == getClass()) - return (T) clone(); - try { - final T result = clazz.newInstance(); - for (final Field field : getClass().getFields()) { - // just copy common fields. - if (!field.getDeclaringClass().isAssignableFrom(clazz)) - continue; - final int fieldModifiers = field.getModifiers(); - if ((fieldModifiers & UNCOPYABLE_MODIFIER_MASK) != 0) continue; - //Use the clone() method if appropriate - if (Cloneable.class.isAssignableFrom(field.getType())) { - Method clone = field.getType().getMethod("clone"); - field.set(result, clone.invoke(field.get(this))); - } else - field.set(result,field.get(this)); - } - return result; - } catch (final Exception ex) { - throw new IllegalStateException(ex); - } - } - - /** - * Creates a copy of this configuration. - * @return never {@code null}. - */ - @Override - public StandardCallerArgumentCollection clone() { - try { - StandardCallerArgumentCollection cloned = (StandardCallerArgumentCollection) super.clone(); - cloned.genotypeArgs = genotypeArgs.clone(); - return cloned; - } catch (CloneNotSupportedException e) { - throw new IllegalStateException("unreachable code"); - } - } - - /** - * Holds a modifiers mask that identifies those fields that cannot be copied between - * StandardCallerArgumentCollections. - */ - private final int UNCOPYABLE_MODIFIER_MASK = Modifier.PRIVATE | Modifier.STATIC | Modifier.FINAL; -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRGatherer.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRGatherer.java new file mode 100644 index 000000000..9ad2282ea --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRGatherer.java @@ -0,0 +1,138 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.apache.commons.collections.CollectionUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.commandline.Gatherer; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.*; + +/** + * User: carneiro + * Date: 3/29/11 + */ + + +public class BQSRGatherer extends Gatherer { + + private static final Logger logger = Logger.getLogger(BQSRGatherer.class); + private static final String EMPTY_INPUT_LIST = "list of inputs files is empty or there is no usable data in any input file"; + private static final String MISSING_OUTPUT_FILE = "missing output file name"; + private static final String MISSING_READ_GROUPS = "Missing read group(s)"; + + @Override + public void gather(final List inputs, final File output) { + final PrintStream outputFile; + try { + outputFile = new PrintStream(output); + } catch(FileNotFoundException e) { + throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE); + } + final GATKReport report = gatherReport(inputs); + report.print(outputFile); + } + + /** + * Gathers the input recalibration reports into a single report. + * + * @param inputs Input recalibration GATK reports + * @return gathered recalibration GATK report + */ + public static GATKReport gatherReport(final List inputs) { + final SortedSet allReadGroups = new TreeSet(); + final LinkedHashMap> inputReadGroups = new LinkedHashMap>(); + + // Get the read groups from each input report + for (final File input : inputs) { + final Set readGroups = RecalibrationReport.getReadGroups(input); + inputReadGroups.put(input, readGroups); + allReadGroups.addAll(readGroups); + } + + // Log the read groups that are missing from specific inputs + for (Map.Entry> entry: inputReadGroups.entrySet()) { + final File input = entry.getKey(); + final Set readGroups = entry.getValue(); + if (allReadGroups.size() != readGroups.size()) { + // Since this is not completely unexpected, more than debug, but less than a proper warning. + logger.info(MISSING_READ_GROUPS + ": " + input.getAbsolutePath()); + for (final Object readGroup: CollectionUtils.subtract(allReadGroups, readGroups)) { + logger.info(" " + readGroup); + } + } + } + + RecalibrationReport generalReport = null; + for (File input : inputs) { + final RecalibrationReport inputReport = new RecalibrationReport(input, allReadGroups); + if( inputReport.isEmpty() ) { continue; } + + if (generalReport == null) + generalReport = inputReport; + else + generalReport.combine(inputReport); + } + if (generalReport == null) + throw new ReviewedGATKException(EMPTY_INPUT_LIST); + + generalReport.calculateQuantizedQualities(); + + return generalReport.createGATKReport(); + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRReadTransformer.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRReadTransformer.java new file mode 100644 index 000000000..b524ad08a --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRReadTransformer.java @@ -0,0 +1,104 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.WalkerManager; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +/** + * A ReadTransformer that applies BQSR on the fly to reads + * + * User: rpoplin + * Date: 2/13/12 + */ +public class BQSRReadTransformer extends ReadTransformer { + private boolean enabled; + private BaseRecalibration bqsr = null; + + @Override + public OrderingConstraint getOrderingConstraint() { return OrderingConstraint.MUST_BE_FIRST; } + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { + this.enabled = engine.hasBQSRArgumentSet(); + if ( enabled ) { + // TODO -- See important note below about applying BQSR to a reduced BAM file: + // If it is important to make sure that BQSR is not applied (as opposed to having the covariates computed) against a reduced bam file, + // we need to figure out how to make this work. The problem is that the ReadTransformers are initialized before the ReadDataSource + // inside the GenomeAnalysisEngine, so we generate a NPE when trying to retrieve the SAMFileHeaders. Ultimately, I don't think this is + // a necessary check anyways since we disallow running BaseRecalibrator on reduced bams (so we can't generate the recal tables to use here). + // Although we could add this check to the apply() method below, it's kind of ugly and inefficient. + // The call here would be: RecalUtils.checkForInvalidRecalBams(engine.getSAMFileHeaders(), engine.getArguments().ALLOW_BQSR_ON_REDUCED_BAMS); + final BQSRArgumentSet args = engine.getBQSRArgumentSet(); + this.bqsr = new BaseRecalibration(args.getRecalFile(), args.getQuantizationLevels(), args.shouldDisableIndelQuals(), args.getPreserveQscoresLessThan(), args.shouldEmitOriginalQuals(), args.getGlobalQScorePrior()); + } + final BQSRMode mode = WalkerManager.getWalkerAnnotation(walker, BQSRMode.class); + return mode.ApplicationTime(); + } + + @Override + public boolean enabled() { + return enabled; + } + + /** + * initialize a new BQSRReadTransformer that applies BQSR on the fly to incoming reads. + */ + @Override + public GATKSAMRecord apply(GATKSAMRecord read) { + bqsr.recalibrateRead(read); + return read; + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BaseRecalibration.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BaseRecalibration.java new file mode 100644 index 000000000..9095f695e --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BaseRecalibration.java @@ -0,0 +1,208 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import com.google.java.contract.Ensures; +import htsjdk.samtools.SAMTag; +import htsjdk.samtools.SAMUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.recalibration.EventType; +import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +/** + * Utility methods to facilitate on-the-fly base quality score recalibration. + * + * User: carneiro and rpoplin + * Date: 2/4/12 + */ + +public class BaseRecalibration { + private static Logger logger = Logger.getLogger(BaseRecalibration.class); + private final static boolean TEST_CACHING = false; + + private final QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done) + private final RecalibrationTables recalibrationTables; + private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation + + private final boolean disableIndelQuals; + private final int preserveQLessThan; + private final double globalQScorePrior; + private final boolean emitOriginalQuals; + + /** + * Constructor using a GATK Report file + * + * @param RECAL_FILE a GATK Report file containing the recalibration information + * @param quantizationLevels number of bins to quantize the quality scores + * @param disableIndelQuals if true, do not emit base indel qualities + * @param preserveQLessThan preserve quality scores less than this value + */ + public BaseRecalibration(final File RECAL_FILE, final int quantizationLevels, final boolean disableIndelQuals, final int preserveQLessThan, final boolean emitOriginalQuals, final double globalQScorePrior) { + RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE); + + recalibrationTables = recalibrationReport.getRecalibrationTables(); + requestedCovariates = recalibrationReport.getRequestedCovariates(); + quantizationInfo = recalibrationReport.getQuantizationInfo(); + if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores + quantizationInfo.noQuantization(); + else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wants to use what's in the report. + quantizationInfo.quantizeQualityScores(quantizationLevels); + + this.disableIndelQuals = disableIndelQuals; + this.preserveQLessThan = preserveQLessThan; + this.globalQScorePrior = globalQScorePrior; + this.emitOriginalQuals = emitOriginalQuals; + } + + /** + * Recalibrates the base qualities of a read + * + * It updates the base qualities of the read with the new recalibrated qualities (for all event types) + * + * Implements a serial recalibration of the reads using the combinational table. + * First, we perform a positional recalibration, and then a subsequent dinuc correction. + * + * Given the full recalibration table, we perform the following preprocessing steps: + * + * - calculate the global quality score shift across all data [DeltaQ] + * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift + * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual + * - The final shift equation is: + * + * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) + * + * @param read the read to recalibrate + */ + public void recalibrateRead(final GATKSAMRecord read) { + if (emitOriginalQuals && read.getAttribute(SAMTag.OQ.name()) == null) { // Save the old qualities if the tag isn't already taken in the read + try { + read.setAttribute(SAMTag.OQ.name(), SAMUtils.phredToFastq(read.getBaseQualities())); + } catch (IllegalArgumentException e) { + throw new UserException.MalformedBAM(read, "illegal base quality encountered; " + e.getMessage()); + } + } + + final ReadCovariates readCovariates = RecalUtils.computeCovariates(read, requestedCovariates); + final int readLength = read.getReadLength(); + + for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings + if (disableIndelQuals && errorModel != EventType.BASE_SUBSTITUTION) { + read.setBaseQualities(null, errorModel); + continue; + } + + final byte[] quals = read.getBaseQualities(errorModel); + + // get the keyset for this base using the error model + final int[][] fullReadKeySet = readCovariates.getKeySet(errorModel); + + // the rg key is constant over the whole read, the global deltaQ is too + final int rgKey = fullReadKeySet[0][0]; + final RecalDatum empiricalQualRG = recalibrationTables.getReadGroupTable().get(rgKey, errorModel.ordinal()); + + if( empiricalQualRG != null ) { + final double epsilon = ( globalQScorePrior > 0.0 && errorModel.equals(EventType.BASE_SUBSTITUTION) ? globalQScorePrior : empiricalQualRG.getEstimatedQReported() ); + + for (int offset = 0; offset < readLength; offset++) { // recalibrate all bases in the read + final byte origQual = quals[offset]; + + // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) + if ( origQual >= preserveQLessThan ) { + // get the keyset for this base using the error model + final int[] keySet = fullReadKeySet[offset]; + final RecalDatum empiricalQualQS = recalibrationTables.getQualityScoreTable().get(keySet[0], keySet[1], errorModel.ordinal()); + final List empiricalQualCovs = new ArrayList(); + for (int i = 2; i < requestedCovariates.length; i++) { + if (keySet[i] < 0) { + continue; + } + empiricalQualCovs.add(recalibrationTables.getTable(i).get(keySet[0], keySet[1], keySet[i], errorModel.ordinal())); + } + + double recalibratedQualDouble = hierarchicalBayesianQualityEstimate( epsilon, empiricalQualRG, empiricalQualQS, empiricalQualCovs ); + + // recalibrated quality is bound between 1 and MAX_QUAL + final byte recalibratedQual = QualityUtils.boundQual(MathUtils.fastRound(recalibratedQualDouble), RecalDatum.MAX_RECALIBRATED_Q_SCORE); + + // return the quantized version of the recalibrated quality + final byte recalibratedQualityScore = quantizationInfo.getQuantizedQuals().get(recalibratedQual); + + quals[offset] = recalibratedQualityScore; + } + } + } + + // finally update the base qualities in the read + read.setBaseQualities(quals, errorModel); + } + } + + @Ensures("result > 0.0") + protected static double hierarchicalBayesianQualityEstimate( final double epsilon, final RecalDatum empiricalQualRG, final RecalDatum empiricalQualQS, final List empiricalQualCovs ) { + final double globalDeltaQ = ( empiricalQualRG == null ? 0.0 : empiricalQualRG.getEmpiricalQuality(epsilon) - epsilon ); + final double deltaQReported = ( empiricalQualQS == null ? 0.0 : empiricalQualQS.getEmpiricalQuality(globalDeltaQ + epsilon) - (globalDeltaQ + epsilon) ); + double deltaQCovariates = 0.0; + for( final RecalDatum empiricalQualCov : empiricalQualCovs ) { + deltaQCovariates += ( empiricalQualCov == null ? 0.0 : empiricalQualCov.getEmpiricalQuality(deltaQReported + globalDeltaQ + epsilon) - (deltaQReported + globalDeltaQ + epsilon) ); + } + + return epsilon + globalDeltaQ + deltaQReported + deltaQCovariates; + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QualQuantizer.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QualQuantizer.java new file mode 100644 index 000000000..b01359fca --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QualQuantizer.java @@ -0,0 +1,500 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportTable; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.io.PrintStream; +import java.util.*; + +/** + * A general algorithm for quantizing quality score distributions to use a specific number of levels + * + * Takes a histogram of quality scores and a desired number of levels and produces a + * map from original quality scores -> quantized quality scores. + * + * Note that this data structure is fairly heavy-weight, holding lots of debugging and + * calculation information. If you want to use it efficiently at scale with lots of + * read groups the right way to do this: + * + * Map> map + * for each read group rg: + * hist = getQualHist(rg) + * QualQuantizer qq = new QualQuantizer(hist, nLevels, minInterestingQual) + * map.set(rg, qq.getOriginalToQuantizedMap()) + * + * This map would then be used to look up the appropriate original -> quantized + * quals for each read as it comes in. + * + * @author Mark Depristo + * @since 3/2/12 + */ +public class QualQuantizer { + final private static Set MY_EMPTY_SET = Collections.emptySet(); + + private static Logger logger = Logger.getLogger(QualQuantizer.class); + + /** + * Inputs to the QualQuantizer + */ + final int nLevels, minInterestingQual; + final List nObservationsPerQual; + + /** + * Map from original qual (e.g., Q30) to new quantized qual (e.g., Q28). + * + * Has the same range as nObservationsPerQual + */ + final List originalToQuantizedMap; + + /** Sorted set of qual intervals. + * + * After quantize() this data structure contains only the top-level qual intervals + */ + final TreeSet quantizedIntervals; + + /** + * Protected creator for testng use only + */ + protected QualQuantizer(final int minInterestingQual) { + this.nObservationsPerQual = Collections.emptyList(); + this.nLevels = 0; + this.minInterestingQual = minInterestingQual; + this.quantizedIntervals = null; + this.originalToQuantizedMap = null; + } + + /** + * Creates a QualQuantizer for the histogram that has nLevels + * + * Note this is the only interface to the system. After creating this object + * the map can be obtained via getOriginalToQuantizedMap() + * + * @param nObservationsPerQual A histogram of counts of bases with quality scores. Note that + * this histogram must start at 0 (i.e., get(0) => count of Q0 bases) and must include counts all the + * way up to the largest quality score possible in the reads. OK if the histogram includes many 0 + * count bins, as these are quantized for free. + * @param nLevels the desired number of distinct quality scores to represent the full original range. Must + * be at least 1. + * @param minInterestingQual All quality scores <= this value are considered uninteresting and are freely + * merged together. For example, if this value is 10, then Q0-Q10 are all considered free to merge, and + * quantized into a single value. For ILMN data with lots of Q2 bases this results in a Q2 bin containing + * all data with Q0-Q10. + */ + public QualQuantizer(final List nObservationsPerQual, final int nLevels, final int minInterestingQual) { + this.nObservationsPerQual = nObservationsPerQual; + this.nLevels = nLevels; + this.minInterestingQual = minInterestingQual; + + // some sanity checking + if ( Collections.min(nObservationsPerQual) < 0 ) throw new ReviewedGATKException("Quality score histogram has negative values at: " + Utils.join(", ", nObservationsPerQual)); + if ( nLevels < 0 ) throw new ReviewedGATKException("nLevels must be >= 0"); + if ( minInterestingQual < 0 ) throw new ReviewedGATKException("minInterestingQual must be >= 0"); + + // actually run the quantizer + this.quantizedIntervals = quantize(); + + // store the map + this.originalToQuantizedMap = intervalsToMap(quantizedIntervals); + } + + /** + * Represents an contiguous interval of quality scores. + * + * qStart and qEnd are inclusive, so qStart = qEnd = 2 is the quality score bin of 2 + */ + @Invariant({ + "qStart <= qEnd", + "qStart >= 0", + "qEnd <= 1000", + "nObservations >= 0", + "nErrors >= 0", + "nErrors <= nObservations", + "fixedQual >= -1 && fixedQual <= QualityUtils.MAX_SAM_QUAL_SCORE", + "mergeOrder >= 0"}) + protected final class QualInterval implements Comparable { + final int qStart, qEnd, fixedQual, level; + final long nObservations, nErrors; + final Set subIntervals; + + /** for debugging / visualization. When was this interval created? */ + int mergeOrder; + + protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level) { + this(qStart, qEnd, nObservations, nErrors, level, -1, MY_EMPTY_SET); + } + + protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final Set subIntervals) { + this(qStart, qEnd, nObservations, nErrors, level, -1, subIntervals); + } + + protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual) { + this(qStart, qEnd, nObservations, nErrors, level, fixedQual, MY_EMPTY_SET); + } + + @Requires("level >= 0") + public QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual, final Set subIntervals) { + this.qStart = qStart; + this.qEnd = qEnd; + this.nObservations = nObservations; + this.nErrors = nErrors; + this.fixedQual = fixedQual; + this.level = level; + this.mergeOrder = 0; + this.subIntervals = Collections.unmodifiableSet(subIntervals); + } + + /** + * @return Human readable name of this interval: e.g., 10-12 + */ + public String getName() { + return qStart + "-" + qEnd; + } + + @Override + public String toString() { + return "QQ:" + getName(); + } + + /** + * @return the error rate (in real space) of this interval, or 0 if there are no observations + */ + @Ensures("result >= 0.0") + public double getErrorRate() { + if ( hasFixedQual() ) + return QualityUtils.qualToErrorProb((byte)fixedQual); + else if ( nObservations == 0 ) + return 0.0; + else + return (nErrors+1) / (1.0 * (nObservations+1)); + } + + /** + * @return the QUAL of the error rate of this interval, or the fixed qual if this interval was created with a fixed qual. + */ + @Ensures("result >= 0 && result <= QualityUtils.MAX_SAM_QUAL_SCORE") + public byte getQual() { + if ( ! hasFixedQual() ) + return QualityUtils.errorProbToQual(getErrorRate()); + else + return (byte)fixedQual; + } + + /** + * @return true if this bin is using a fixed qual + */ + public boolean hasFixedQual() { + return fixedQual != -1; + } + + @Override + public int compareTo(final QualInterval qualInterval) { + return Integer.valueOf(this.qStart).compareTo(qualInterval.qStart); + } + + /** + * Create a interval representing the merge of this interval and toMerge + * + * Errors and observations are combined + * Subintervals updated in order of left to right (determined by qStart) + * Level is 1 + highest level of this and toMerge + * Order must be updated elsewhere + * + * @param toMerge + * @return newly created merged QualInterval + */ + @Requires({"toMerge != null"}) + @Ensures({ + "result != null", + "result.nObservations >= this.nObservations", + "result.nObservations >= toMerge.nObservations", + "result.nErrors >= this.nErrors", + "result.nErrors >= toMerge.nErrors", + "result.qStart == Math.min(this.qStart, toMerge.qStart)", + "result.qEnd == Math.max(this.qEnd, toMerge.qEnd)", + "result.level > Math.max(this.level, toMerge.level)", + "result.subIntervals.size() == 2" + }) + public QualInterval merge(final QualInterval toMerge) { + final QualInterval left = this.compareTo(toMerge) < 0 ? this : toMerge; + final QualInterval right = this.compareTo(toMerge) < 0 ? toMerge : this; + + if ( left.qEnd + 1 != right.qStart ) + throw new ReviewedGATKException("Attempting to merge non-contiguous intervals: left = " + left + " right = " + right); + + final long nCombinedObs = left.nObservations + right.nObservations; + final long nCombinedErr = left.nErrors + right.nErrors; + + final int level = Math.max(left.level, right.level) + 1; + final Set subIntervals = new HashSet(Arrays.asList(left, right)); + QualInterval merged = new QualInterval(left.qStart, right.qEnd, nCombinedObs, nCombinedErr, level, subIntervals); + + return merged; + } + + public double getPenalty() { + return calcPenalty(getErrorRate()); + } + + + /** + * Calculate the penalty of this interval, given the overall error rate for the interval + * + * If the globalErrorRate is e, this value is: + * + * sum_i |log10(e_i) - log10(e)| * nObservations_i + * + * each the index i applies to all leaves of the tree accessible from this interval + * (found recursively from subIntervals as necessary) + * + * @param globalErrorRate overall error rate in real space against which we calculate the penalty + * @return the cost of approximating the bins in this interval with the globalErrorRate + */ + @Requires("globalErrorRate >= 0.0") + @Ensures("result >= 0.0") + private double calcPenalty(final double globalErrorRate) { + if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty + return 0.0; + + if ( subIntervals.isEmpty() ) { + // this is leave node + if ( this.qEnd <= minInterestingQual ) + // It's free to merge up quality scores below the smallest interesting one + return 0; + else { + return (Math.abs(Math.log10(getErrorRate()) - Math.log10(globalErrorRate))) * nObservations; + } + } else { + double sum = 0; + for ( final QualInterval interval : subIntervals ) + sum += interval.calcPenalty(globalErrorRate); + return sum; + } + } + } + + /** + * Main method for computing the quantization intervals. + * + * Invoked in the constructor after all input variables are initialized. Walks + * over the inputs and builds the min. penalty forest of intervals with exactly nLevel + * root nodes. Finds this min. penalty forest via greedy search, so is not guarenteed + * to find the optimal combination. + * + * TODO: develop a smarter algorithm + * + * @return the forest of intervals with size == nLevels + */ + @Ensures({"! result.isEmpty()", "result.size() == nLevels"}) + private TreeSet quantize() { + // create intervals for each qual individually + final TreeSet intervals = new TreeSet(); + for ( int qStart = 0; qStart < getNQualsInHistogram(); qStart++ ) { + final long nObs = nObservationsPerQual.get(qStart); + final double errorRate = QualityUtils.qualToErrorProb((byte)qStart); + final double nErrors = nObs * errorRate; + final QualInterval qi = new QualInterval(qStart, qStart, nObs, (int)Math.floor(nErrors), 0, (byte)qStart); + intervals.add(qi); + } + + // greedy algorithm: + // while ( n intervals >= nLevels ): + // find intervals to merge with least penalty + // merge it + while ( intervals.size() > nLevels ) { + mergeLowestPenaltyIntervals(intervals); + } + + return intervals; + } + + /** + * Helper function that finds and merges together the lowest penalty pair of intervals + * @param intervals + */ + @Requires("! intervals.isEmpty()") + private void mergeLowestPenaltyIntervals(final TreeSet intervals) { + // setup the iterators + final Iterator it1 = intervals.iterator(); + final Iterator it1p = intervals.iterator(); + it1p.next(); // skip one + + // walk over the pairs of left and right, keeping track of the pair with the lowest merge penalty + QualInterval minMerge = null; + if ( logger.isDebugEnabled() ) logger.debug("mergeLowestPenaltyIntervals: " + intervals.size()); + int lastMergeOrder = 0; + while ( it1p.hasNext() ) { + final QualInterval left = it1.next(); + final QualInterval right = it1p.next(); + final QualInterval merged = left.merge(right); + lastMergeOrder = Math.max(Math.max(lastMergeOrder, left.mergeOrder), right.mergeOrder); + if ( minMerge == null || (merged.getPenalty() < minMerge.getPenalty() ) ) { + if ( logger.isDebugEnabled() ) logger.debug(" Updating merge " + minMerge); + minMerge = merged; + } + } + + // now actually go ahead and merge the minMerge pair + if ( logger.isDebugEnabled() ) logger.debug(" => final min merge " + minMerge); + intervals.removeAll(minMerge.subIntervals); + intervals.add(minMerge); + minMerge.mergeOrder = lastMergeOrder + 1; + if ( logger.isDebugEnabled() ) logger.debug("updated intervals: " + intervals); + } + + /** + * Given a final forest of intervals constructs a list mapping + * list.get(i) => quantized qual to use for original quality score i + * + * This function should be called only once to initialize the corresponding + * cached value in this object, as the calculation is a bit costly. + * + * @param intervals + * @return + */ + @Ensures("result.size() == getNQualsInHistogram()") + private List intervalsToMap(final TreeSet intervals) { + final List map = new ArrayList(getNQualsInHistogram()); + map.addAll(Collections.nCopies(getNQualsInHistogram(), Byte.MIN_VALUE)); + for ( final QualInterval interval : intervals ) { + for ( int q = interval.qStart; q <= interval.qEnd; q++ ) { + map.set(q, interval.getQual()); + } + } + + if ( Collections.min(map) == Byte.MIN_VALUE ) + throw new ReviewedGATKException("quantized quality score map contains an un-initialized value"); + + return map; + } + + @Ensures("result > 0") + private final int getNQualsInHistogram() { + return nObservationsPerQual.size(); + } + + /** + * Write out a GATKReport to visualize the QualQuantization process of this data + * @param out + */ + public void writeReport(PrintStream out) { + final GATKReport report = new GATKReport(); + + addQualHistogramToReport(report); + addIntervalsToReport(report); + + report.print(out); + } + + private final void addQualHistogramToReport(final GATKReport report) { + report.addTable("QualHistogram", "Quality score histogram provided to report", 2); + GATKReportTable table = report.getTable("QualHistogram"); + + table.addColumn("qual"); + table.addColumn("count"); + + for ( int q = 0; q < nObservationsPerQual.size(); q++ ) { + table.set(q, "qual", q); + table.set(q, "count", nObservationsPerQual.get(q)); + } + } + + + private final void addIntervalsToReport(final GATKReport report) { + report.addTable("QualQuantizerIntervals", "Table of QualQuantizer quantization intervals", 10); + GATKReportTable table = report.getTable("QualQuantizerIntervals"); + + table.addColumn("name"); + table.addColumn("qStart"); + table.addColumn("qEnd"); + table.addColumn("level"); + table.addColumn("merge.order"); + table.addColumn("nErrors"); + table.addColumn("nObservations"); + table.addColumn("qual"); + table.addColumn("penalty"); + table.addColumn("root.node"); + //table.addColumn("subintervals", "NA"); + + for ( QualInterval interval : quantizedIntervals ) + addIntervalToReport(table, interval, true); + } + + private final void addIntervalToReport(final GATKReportTable table, final QualInterval interval, final boolean atRootP) { + final String name = interval.getName(); + table.set(name, "name", name); + table.set(name, "qStart", interval.qStart); + table.set(name, "qEnd", interval.qEnd); + table.set(name, "level", interval.level); + table.set(name, "merge.order", interval.mergeOrder); + table.set(name, "nErrors", interval.nErrors); + table.set(name, "nObservations", interval.nObservations); + table.set(name, "qual", interval.getQual()); + table.set(name, "penalty", String.format("%.1f", interval.getPenalty())); + table.set(name, "root.node", atRootP); + + for ( final QualInterval sub : interval.subIntervals ) + addIntervalToReport(table, sub, false); + } + + public List getOriginalToQuantizedMap() { + return originalToQuantizedMap; + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QuantizationInfo.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QuantizationInfo.java new file mode 100644 index 000000000..e054805af --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QuantizationInfo.java @@ -0,0 +1,151 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.utils.report.GATKReportTable; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; + +import java.util.Arrays; +import java.util.List; + +/** + * Class that encapsulates the information necessary for quality score quantization for BQSR + * + * @author carneiro + * @since 3/26/12 + */ +public class QuantizationInfo { + private List quantizedQuals; + private List empiricalQualCounts; + private int quantizationLevels; + + private QuantizationInfo(List quantizedQuals, List empiricalQualCounts, int quantizationLevels) { + this.quantizedQuals = quantizedQuals; + this.empiricalQualCounts = empiricalQualCounts; + this.quantizationLevels = quantizationLevels; + } + + public QuantizationInfo(List quantizedQuals, List empiricalQualCounts) { + this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals)); + } + + public QuantizationInfo(final RecalibrationTables recalibrationTables, final int quantizationLevels) { + final Long [] qualHistogram = new Long[QualityUtils.MAX_SAM_QUAL_SCORE +1]; // create a histogram with the empirical quality distribution + for (int i = 0; i < qualHistogram.length; i++) + qualHistogram[i] = 0L; + + final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); // get the quality score table + + for (final RecalDatum value : qualTable.getAllValues()) { + final RecalDatum datum = value; + final int empiricalQual = MathUtils.fastRound(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL ) + qualHistogram[empiricalQual] += (long) datum.getNumObservations(); // add the number of observations for every key + } + empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities + quantizeQualityScores(quantizationLevels); + + this.quantizationLevels = quantizationLevels; + } + + + public void quantizeQualityScores(int nLevels) { + QualQuantizer quantizer = new QualQuantizer(empiricalQualCounts, nLevels, QualityUtils.MIN_USABLE_Q_SCORE); // quantize the qualities to the desired number of levels + quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC) + } + + public void noQuantization() { + this.quantizationLevels = QualityUtils.MAX_SAM_QUAL_SCORE; + for (int i = 0; i < this.quantizationLevels; i++) + quantizedQuals.set(i, (byte) i); + } + + public List getQuantizedQuals() { + return quantizedQuals; + } + + public int getQuantizationLevels() { + return quantizationLevels; + } + + public GATKReportTable generateReportTable(boolean sortByCols) { + GATKReportTable quantizedTable; + if(sortByCols) { + quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3, GATKReportTable.TableSortingWay.SORT_BY_COLUMN); + } else { + quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3); + } + quantizedTable.addColumn(RecalUtils.QUALITY_SCORE_COLUMN_NAME); + quantizedTable.addColumn(RecalUtils.QUANTIZED_COUNT_COLUMN_NAME); + quantizedTable.addColumn(RecalUtils.QUANTIZED_VALUE_COLUMN_NAME); + + for (int qual = 0; qual <= QualityUtils.MAX_SAM_QUAL_SCORE; qual++) { + quantizedTable.set(qual, RecalUtils.QUALITY_SCORE_COLUMN_NAME, qual); + quantizedTable.set(qual, RecalUtils.QUANTIZED_COUNT_COLUMN_NAME, empiricalQualCounts.get(qual)); + quantizedTable.set(qual, RecalUtils.QUANTIZED_VALUE_COLUMN_NAME, quantizedQuals.get(qual)); + } + return quantizedTable; + } + + private static int calculateQuantizationLevels(List quantizedQuals) { + byte lastByte = -1; + int quantizationLevels = 0; + for (byte q : quantizedQuals) { + if (q != lastByte) { + quantizationLevels++; + lastByte = q; + } + } + return quantizationLevels; + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/ReadCovariates.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/ReadCovariates.java new file mode 100644 index 000000000..c02dd4881 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/ReadCovariates.java @@ -0,0 +1,176 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.LRUCache; +import org.broadinstitute.gatk.utils.recalibration.EventType; + +/** + * The object temporarily held by a read that describes all of it's covariates. + * + * In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap + * + * @author Mauricio Carneiro + * @since 2/8/12 + */ +public class ReadCovariates { + private final static Logger logger = Logger.getLogger(ReadCovariates.class); + + /** + * How big should we let the LRU cache grow + */ + private static final int LRU_CACHE_SIZE = 500; + + /** + * Use an LRU cache to keep cache of keys (int[][][]) arrays for each read length we've seen. + * The cache allows us to avoid the expense of recreating these arrays for every read. The LRU + * keeps the total number of cached arrays to less than LRU_CACHE_SIZE. + * + * This is a thread local variable, so the total memory required may grow to N_THREADS x LRU_CACHE_SIZE + */ + private final static ThreadLocal> keysCache = new ThreadLocal>() { + @Override protected LRUCache initialValue() { + return new LRUCache(LRU_CACHE_SIZE); + } + }; + + /** + * The keys cache is only valid for a single covariate count. Normally this will remain constant for the analysis. + * If running multiple analyses (or the unit test suite), it's necessary to clear the cache. + */ + public static void clearKeysCache() { + keysCache.remove(); + } + + /** + * Our keys, indexed by event type x read length x covariate + */ + private final int[][][] keys; + + /** + * The index of the current covariate, used by addCovariate + */ + private int currentCovariateIndex = 0; + + public ReadCovariates(final int readLength, final int numberOfCovariates) { + final LRUCache cache = keysCache.get(); + final int[][][] cachedKeys = cache.get(readLength); + if ( cachedKeys == null ) { + // There's no cached value for read length so we need to create a new int[][][] array + if ( logger.isDebugEnabled() ) logger.debug("Keys cache miss for length " + readLength + " cache size " + cache.size()); + keys = new int[EventType.values().length][readLength][numberOfCovariates]; + cache.put(readLength, keys); + } else { + keys = cachedKeys; + } + } + + public void setCovariateIndex(final int index) { + currentCovariateIndex = index; + } + + /** + * Update the keys for mismatch, insertion, and deletion for the current covariate at read offset + * + * NOTE: no checks are performed on the number of covariates, for performance reasons. If the count increases + * after the keysCache has been accessed, this method will throw an ArrayIndexOutOfBoundsException. This currently + * only occurs in the testing harness, and we don't anticipate that it will become a part of normal runs. + * + * @param mismatch the mismatch key value + * @param insertion the insertion key value + * @param deletion the deletion key value + * @param readOffset the read offset, must be >= 0 and <= the read length used to create this ReadCovariates + */ + public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { + keys[EventType.BASE_SUBSTITUTION.ordinal()][readOffset][currentCovariateIndex] = mismatch; + keys[EventType.BASE_INSERTION.ordinal()][readOffset][currentCovariateIndex] = insertion; + keys[EventType.BASE_DELETION.ordinal()][readOffset][currentCovariateIndex] = deletion; + } + + /** + * Get the keys for all covariates at read position for error model + * + * @param readPosition + * @param errorModel + * @return + */ + public int[] getKeySet(final int readPosition, final EventType errorModel) { + return keys[errorModel.ordinal()][readPosition]; + } + + public int[][] getKeySet(final EventType errorModel) { + return keys[errorModel.ordinal()]; + } + + // ---------------------------------------------------------------------- + // + // routines for testing + // + // ---------------------------------------------------------------------- + + protected int[][] getMismatchesKeySet() { return getKeySet(EventType.BASE_SUBSTITUTION); } + protected int[][] getInsertionsKeySet() { return getKeySet(EventType.BASE_INSERTION); } + protected int[][] getDeletionsKeySet() { return getKeySet(EventType.BASE_DELETION); } + + protected int[] getMismatchesKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_SUBSTITUTION); + } + + protected int[] getInsertionsKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_INSERTION); + } + + protected int[] getDeletionsKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_DELETION); + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatum.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatum.java new file mode 100644 index 000000000..c92ef1773 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatum.java @@ -0,0 +1,434 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import htsjdk.samtools.SAMUtils; +import org.apache.commons.math.optimization.fitting.GaussianFunction; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.QualityUtils; + + +/** + * An individual piece of recalibration data. Each bin counts up the number of observations and the number + * of reference mismatches seen for that combination of covariates. + * + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 3, 2009 + */ +@Invariant({ + "estimatedQReported >= 0.0", + "! Double.isNaN(estimatedQReported)", + "! Double.isInfinite(estimatedQReported)", + "empiricalQuality >= 0.0 || empiricalQuality == UNINITIALIZED", + "! Double.isNaN(empiricalQuality)", + "! Double.isInfinite(empiricalQuality)", + "numObservations >= 0", + "numMismatches >= 0", + "numMismatches <= numObservations" +}) +public class RecalDatum { + public final static byte MAX_RECALIBRATED_Q_SCORE = SAMUtils.MAX_PHRED_SCORE; + private static final double UNINITIALIZED = -1.0; + + /** + * estimated reported quality score based on combined data's individual q-reporteds and number of observations + */ + private double estimatedQReported; + + /** + * the empirical quality for datums that have been collapsed together (by read group and reported quality, for example) + */ + private double empiricalQuality; + + /** + * number of bases seen in total + */ + private long numObservations; + + /** + * number of bases seen that didn't match the reference + */ + private double numMismatches; + + /** + * used when calculating empirical qualities to avoid division by zero + */ + private static final int SMOOTHING_CONSTANT = 1; + + //--------------------------------------------------------------------------------------------------------------- + // + // constructors + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * Create a new RecalDatum with given observation and mismatch counts, and an reported quality + * + * @param _numObservations observations + * @param _numMismatches mismatches + * @param reportedQuality Qreported + */ + public RecalDatum(final long _numObservations, final double _numMismatches, final byte reportedQuality) { + if ( _numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); + if ( _numMismatches < 0.0 ) throw new IllegalArgumentException("numMismatches < 0"); + if ( reportedQuality < 0 ) throw new IllegalArgumentException("reportedQuality < 0"); + + numObservations = _numObservations; + numMismatches = _numMismatches; + estimatedQReported = reportedQuality; + empiricalQuality = UNINITIALIZED; + } + + /** + * Copy copy into this recal datum, overwriting all of this objects data + * @param copy RecalDatum to copy + */ + public RecalDatum(final RecalDatum copy) { + this.numObservations = copy.getNumObservations(); + this.numMismatches = copy.getNumMismatches(); + this.estimatedQReported = copy.estimatedQReported; + this.empiricalQuality = copy.empiricalQuality; + } + + /** + * Add in all of the data from other into this object, updating the reported quality from the expected + * error rate implied by the two reported qualities + * + * @param other RecalDatum to combine + */ + public synchronized void combine(final RecalDatum other) { + final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); + increment(other.getNumObservations(), other.getNumMismatches()); + estimatedQReported = -10 * Math.log10(sumErrors / getNumObservations()); + empiricalQuality = UNINITIALIZED; + } + + public synchronized void setEstimatedQReported(final double estimatedQReported) { + if ( estimatedQReported < 0 ) throw new IllegalArgumentException("estimatedQReported < 0"); + if ( Double.isInfinite(estimatedQReported) ) throw new IllegalArgumentException("estimatedQReported is infinite"); + if ( Double.isNaN(estimatedQReported) ) throw new IllegalArgumentException("estimatedQReported is NaN"); + + this.estimatedQReported = estimatedQReported; + empiricalQuality = UNINITIALIZED; + } + + public final double getEstimatedQReported() { + return estimatedQReported; + } + public final byte getEstimatedQReportedAsByte() { + return (byte)(int)(Math.round(getEstimatedQReported())); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // Empirical quality score -- derived from the num mismatches and observations + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * Returns the error rate (in real space) of this interval, or 0 if there are no observations + * @return the empirical error rate ~= N errors / N obs + */ + @Ensures({"result >= 0.0"}) + public double getEmpiricalErrorRate() { + if ( numObservations == 0 ) + return 0.0; + else { + // cache the value so we don't call log over and over again + final double doubleMismatches = numMismatches + SMOOTHING_CONSTANT; + // smoothing is one error and one non-error observation, for example + final double doubleObservations = numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT; + return doubleMismatches / doubleObservations; + } + } + + public synchronized void setEmpiricalQuality(final double empiricalQuality) { + if ( empiricalQuality < 0 ) throw new IllegalArgumentException("empiricalQuality < 0"); + if ( Double.isInfinite(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is infinite"); + if ( Double.isNaN(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is NaN"); + + this.empiricalQuality = empiricalQuality; + } + + public final double getEmpiricalQuality() { + return getEmpiricalQuality(getEstimatedQReported()); + } + + public synchronized final double getEmpiricalQuality(final double conditionalPrior) { + if (empiricalQuality == UNINITIALIZED) { + calcEmpiricalQuality(conditionalPrior); + } + return empiricalQuality; + } + + public final byte getEmpiricalQualityAsByte() { + return (byte)(Math.round(getEmpiricalQuality())); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // toString methods + // + //--------------------------------------------------------------------------------------------------------------- + + @Override + public String toString() { + return String.format("%d,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality()); + } + + public String stringForCSV() { + return String.format("%s,%.2f,%.2f", toString(), getEstimatedQReported(), getEmpiricalQuality() - getEstimatedQReported()); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // increment methods + // + //--------------------------------------------------------------------------------------------------------------- + + public final long getNumObservations() { + return numObservations; + } + + public final synchronized void setNumObservations(final long numObservations) { + if ( numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); + this.numObservations = numObservations; + empiricalQuality = UNINITIALIZED; + } + + public final double getNumMismatches() { + return numMismatches; + } + + @Requires({"numMismatches >= 0"}) + public final synchronized void setNumMismatches(final double numMismatches) { + if ( numMismatches < 0 ) throw new IllegalArgumentException("numMismatches < 0"); + this.numMismatches = numMismatches; + empiricalQuality = UNINITIALIZED; + } + + @Requires({"by >= 0"}) + public final synchronized void incrementNumObservations(final long by) { + numObservations += by; + empiricalQuality = UNINITIALIZED; + } + + @Requires({"by >= 0"}) + public final synchronized void incrementNumMismatches(final double by) { + numMismatches += by; + empiricalQuality = UNINITIALIZED; + } + + @Requires({"incObservations >= 0", "incMismatches >= 0"}) + @Ensures({"numObservations == old(numObservations) + incObservations", "numMismatches == old(numMismatches) + incMismatches"}) + public final synchronized void increment(final long incObservations, final double incMismatches) { + numObservations += incObservations; + numMismatches += incMismatches; + empiricalQuality = UNINITIALIZED; + } + + @Ensures({"numObservations == old(numObservations) + 1", "numMismatches >= old(numMismatches)"}) + public final synchronized void increment(final boolean isError) { + increment(1, isError ? 1.0 : 0.0); + } + + // ------------------------------------------------------------------------------------- + // + // Private implementation helper functions + // + // ------------------------------------------------------------------------------------- + + /** + * calculate the expected number of errors given the estimated Q reported and the number of observations + * in this datum. + * + * @return a positive (potentially fractional) estimate of the number of errors + */ + @Ensures("result >= 0.0") + private double calcExpectedErrors() { + return getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported); + } + + /** + * Calculate and cache the empirical quality score from mismatches and observations (expensive operation) + */ + @Requires("empiricalQuality == UNINITIALIZED") + @Ensures("empiricalQuality != UNINITIALIZED") + private synchronized void calcEmpiricalQuality(final double conditionalPrior) { + + // smoothing is one error and one non-error observation + final long mismatches = (long)(getNumMismatches() + 0.5) + SMOOTHING_CONSTANT; + final long observations = getNumObservations() + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT; + + final double empiricalQual = RecalDatum.bayesianEstimateOfEmpiricalQuality(observations, mismatches, conditionalPrior); + + // This is the old and busted point estimate approach: + //final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate()); + + empiricalQuality = Math.min(empiricalQual, (double) MAX_RECALIBRATED_Q_SCORE); + } + + //static final boolean DEBUG = false; + static private final double RESOLUTION_BINS_PER_QUAL = 1.0; + + static public double bayesianEstimateOfEmpiricalQuality(final long nObservations, final long nErrors, final double QReported) { + + final int numBins = (QualityUtils.MAX_REASONABLE_Q_SCORE + 1) * (int)RESOLUTION_BINS_PER_QUAL; + + final double[] log10Posteriors = new double[numBins]; + + for ( int bin = 0; bin < numBins; bin++ ) { + + final double QEmpOfBin = bin / RESOLUTION_BINS_PER_QUAL; + + log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10QempLikelihood(QEmpOfBin, nObservations, nErrors); + + //if ( DEBUG ) + // System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin])); + } + + //if ( DEBUG ) + // System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors)); + + final double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10Posteriors); + final int MLEbin = MathUtils.maxElementIndex(normalizedPosteriors); + + final double Qemp = MLEbin / RESOLUTION_BINS_PER_QUAL; + return Qemp; + } + + /** + * Quals above this value should be capped down to this value (because they are too high) + * in the base quality score recalibrator + */ + public final static byte MAX_GATK_USABLE_Q_SCORE = 40; + static private final double[] log10QempPriorCache = new double[MAX_GATK_USABLE_Q_SCORE + 1]; + static { + // f(x) = a + b*exp(-((x - c)^2 / (2*d^2))) + // Note that b is the height of the curve's peak, c is the position of the center of the peak, and d controls the width of the "bell". + final double GF_a = 0.0; + final double GF_b = 0.9; + final double GF_c = 0.0; + final double GF_d = 0.5; // with these parameters, deltas can shift at most ~20 Q points + + final GaussianFunction gaussian = new GaussianFunction(GF_a, GF_b, GF_c, GF_d); + for ( int i = 0; i <= MAX_GATK_USABLE_Q_SCORE; i++ ) { + double log10Prior = Math.log10(gaussian.value((double) i)); + if ( Double.isInfinite(log10Prior) ) + log10Prior = -Double.MAX_VALUE; + log10QempPriorCache[i] = log10Prior; + } + } + + static protected double log10QempPrior(final double Qempirical, final double Qreported) { + final int difference = Math.min(Math.abs((int) (Qempirical - Qreported)), MAX_GATK_USABLE_Q_SCORE); + //if ( DEBUG ) + // System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference])); + return log10QempPriorCache[difference]; + } + + static private final long MAX_NUMBER_OF_OBSERVATIONS = Integer.MAX_VALUE - 1; + + static protected double log10QempLikelihood(final double Qempirical, long nObservations, long nErrors) { + if ( nObservations == 0 ) + return 0.0; + + // the binomial code requires ints as input (because it does caching). This should theoretically be fine because + // there is plenty of precision in 2^31 observations, but we need to make sure that we don't have overflow + // before casting down to an int. + if ( nObservations > MAX_NUMBER_OF_OBSERVATIONS ) { + // we need to decrease nErrors by the same fraction that we are decreasing nObservations + final double fraction = (double)MAX_NUMBER_OF_OBSERVATIONS / (double)nObservations; + nErrors = Math.round((double)nErrors * fraction); + nObservations = MAX_NUMBER_OF_OBSERVATIONS; + } + + // this is just a straight binomial PDF + double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10(Qempirical)); + if ( Double.isInfinite(log10Prob) || Double.isNaN(log10Prob) ) + log10Prob = -Double.MAX_VALUE; + + //if ( DEBUG ) + // System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob)); + + return log10Prob; + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatumNode.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatumNode.java new file mode 100644 index 000000000..14b4c762b --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatumNode.java @@ -0,0 +1,582 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.commons.math.MathException; +import org.apache.commons.math.stat.inference.ChiSquareTestImpl; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.util.Collection; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.Set; + +/** + * A tree of recal datum, where each contains a set of sub datum representing sub-states of the higher level one + * + * @author Mark DePristo + * @since 07/27/12 + */ +public class RecalDatumNode { + private final static double SMALLEST_CHI2_PVALUE = 1e-300; + protected static final Logger logger = Logger.getLogger(RecalDatumNode.class); + + /** + * fixedPenalty is this value if it's considered fixed + */ + private final static double UNINITIALIZED = Double.NEGATIVE_INFINITY; + + private final T recalDatum; + private double fixedPenalty = UNINITIALIZED; + private final Set> subnodes; + + @Requires({"recalDatum != null"}) + public RecalDatumNode(final T recalDatum) { + this(recalDatum, new HashSet>()); + } + + @Override + public String toString() { + return recalDatum.toString(); + } + + @Requires({"recalDatum != null", "subnodes != null"}) + public RecalDatumNode(final T recalDatum, final Set> subnodes) { + this(recalDatum, UNINITIALIZED, subnodes); + } + + @Requires({"recalDatum != null"}) + protected RecalDatumNode(final T recalDatum, final double fixedPenalty) { + this(recalDatum, fixedPenalty, new HashSet>()); + } + + @Requires({"recalDatum != null", "subnodes != null"}) + protected RecalDatumNode(final T recalDatum, final double fixedPenalty, final Set> subnodes) { + this.recalDatum = recalDatum; + this.fixedPenalty = fixedPenalty; + this.subnodes = new HashSet>(subnodes); + } + + /** + * Get the recal data associated with this node + * @return + */ + @Ensures("result != null") + public T getRecalDatum() { + return recalDatum; + } + + /** + * The set of all subnodes of this tree. May be modified. + * @return + */ + @Ensures("result != null") + public Set> getSubnodes() { + return subnodes; + } + + /** + * Return the fixed penalty, if set, or else the the calculated penalty for this node + * @return + */ + public double getPenalty() { + if ( fixedPenalty != UNINITIALIZED ) + return fixedPenalty; + else + return calcPenalty(); + } + + /** + * Set the fixed penalty for this node to a fresh calculation from calcPenalty + * + * This is important in the case where you want to compute the penalty from a full + * tree and then chop the tree up afterwards while considering the previous penalties. + * If you don't call this function then manipulating the tree may result in the + * penalty functions changing with changes in the tree. + * + * @param doEntireTree recurse into all subnodes? + * @return the fixed penalty for this node + */ + public double calcAndSetFixedPenalty(final boolean doEntireTree) { + fixedPenalty = calcPenalty(); + if ( doEntireTree ) + for ( final RecalDatumNode sub : subnodes ) + sub.calcAndSetFixedPenalty(doEntireTree); + return fixedPenalty; + } + + /** + * Add node to the set of subnodes of this node + * @param sub + */ + @Requires("sub != null") + public void addSubnode(final RecalDatumNode sub) { + subnodes.add(sub); + } + + /** + * Is this a leaf node (i.e., has no subnodes)? + * @return + */ + public boolean isLeaf() { + return subnodes.isEmpty(); + } + + /** + * Is this node immediately above only leaf nodes? + * + * @return + */ + public boolean isAboveOnlyLeaves() { + for ( final RecalDatumNode sub : subnodes ) + if ( ! sub.isLeaf() ) + return false; + return true; + } + + /** + * What's the immediate number of subnodes from this node? + * @return + */ + @Ensures("result >= 0") + public int getNumSubnodes() { + return subnodes.size(); + } + + /** + * Total penalty is the sum of leaf node penalties + * + * This algorithm assumes that penalties have been fixed before pruning, as leaf nodes by + * definition have 0 penalty unless they represent a pruned tree with underlying -- but now + * pruned -- subtrees + * + * @return + */ + public double totalPenalty() { + if ( isLeaf() ) + return getPenalty(); + else { + double sum = 0.0; + for ( final RecalDatumNode sub : subnodes ) + sum += sub.totalPenalty(); + return sum; + } + } + + /** + * The maximum penalty among all nodes + * @return + */ + public double maxPenalty(final boolean leafOnly) { + double max = ! leafOnly || isLeaf() ? getPenalty() : Double.MIN_VALUE; + for ( final RecalDatumNode sub : subnodes ) + max = Math.max(max, sub.maxPenalty(leafOnly)); + return max; + } + + /** + * The minimum penalty among all nodes + * @return + */ + public double minPenalty(final boolean leafOnly) { + double min = ! leafOnly || isLeaf() ? getPenalty() : Double.MAX_VALUE; + for ( final RecalDatumNode sub : subnodes ) + min = Math.min(min, sub.minPenalty(leafOnly)); + return min; + } + + /** + * What's the longest branch from this node to any leaf? + * @return + */ + public int maxDepth() { + int subMax = 0; + for ( final RecalDatumNode sub : subnodes ) + subMax = Math.max(subMax, sub.maxDepth()); + return subMax + 1; + } + + /** + * What's the shortest branch from this node to any leaf? Includes this node + * @return + */ + @Ensures("result > 0") + public int minDepth() { + if ( isLeaf() ) + return 1; + else { + int subMin = Integer.MAX_VALUE; + for ( final RecalDatumNode sub : subnodes ) + subMin = Math.min(subMin, sub.minDepth()); + return subMin + 1; + } + } + + /** + * Return the number of nodes, including this one, reachable from this node + * @return + */ + @Ensures("result > 0") + public int size() { + int size = 1; + for ( final RecalDatumNode sub : subnodes ) + size += sub.size(); + return size; + } + + /** + * Count the number of leaf nodes reachable from this node + * + * @return + */ + @Ensures("result >= 0") + public int numLeaves() { + if ( isLeaf() ) + return 1; + else { + int size = 0; + for ( final RecalDatumNode sub : subnodes ) + size += sub.numLeaves(); + return size; + } + } + + /** + * Calculate the phred-scaled p-value for a chi^2 test for independent among subnodes of this node. + * + * The chi^2 value indicates the degree of independence of the implied error rates among the + * immediate subnodes + * + * @return the phred-scaled p-value for chi2 penalty, or 0.0 if it cannot be calculated + */ + private double calcPenalty() { + if ( isLeaf() || freeToMerge() ) + return 0.0; + else if ( subnodes.size() == 1 ) + // only one value, so its free to merge away + return 0.0; + else { + final long[][] counts = new long[subnodes.size()][2]; + + int i = 0; + for ( final RecalDatumNode subnode : subnodes ) { + // use the yates correction to help avoid all zeros => NaN + counts[i][0] = Math.round(subnode.getRecalDatum().getNumMismatches()) + 1L; + counts[i][1] = subnode.getRecalDatum().getNumObservations() + 2L; + i++; + } + + try { + final double chi2PValue = new ChiSquareTestImpl().chiSquareTest(counts); + final double penalty = -10.0 * Math.log10(Math.max(chi2PValue, SMALLEST_CHI2_PVALUE)); + + // make sure things are reasonable and fail early if not + if (Double.isInfinite(penalty) || Double.isNaN(penalty)) + throw new ReviewedGATKException("chi2 value is " + chi2PValue + " at " + getRecalDatum()); + + return penalty; + } catch ( MathException e ) { + throw new ReviewedGATKException("Failed in calculating chi2 value", e); + } + } + } + + /** + * Is this node free to merge because its rounded Q score is the same as all nodes below + * @return + */ + private boolean freeToMerge() { + if ( isLeaf() ) // leaves are free to merge + return true; + else { + final byte myQual = getRecalDatum().getEmpiricalQualityAsByte(); + for ( final RecalDatumNode sub : subnodes ) + if ( sub.getRecalDatum().getEmpiricalQualityAsByte() != myQual ) + return false; + return true; + } + } + + /** + * Calculate the penalty of this interval, given the overall error rate for the interval + * + * If the globalErrorRate is e, this value is: + * + * sum_i |log10(e_i) - log10(e)| * nObservations_i + * + * each the index i applies to all leaves of the tree accessible from this interval + * (found recursively from subnodes as necessary) + * + * @param globalErrorRate overall error rate in real space against which we calculate the penalty + * @return the cost of approximating the bins in this interval with the globalErrorRate + */ + @Requires("globalErrorRate >= 0.0") + @Ensures("result >= 0.0") + private double calcPenaltyLog10(final double globalErrorRate) { + if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty + return 0.0; + + if ( isLeaf() ) { + // this is leave node + return (Math.abs(Math.log10(recalDatum.getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * (double)recalDatum.getNumObservations(); + // TODO -- how we can generalize this calculation? +// if ( this.qEnd <= minInterestingQual ) +// // It's free to merge up quality scores below the smallest interesting one +// return 0; +// else { +// return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations(); +// } + } else { + double sum = 0; + for ( final RecalDatumNode hrd : subnodes) + sum += hrd.calcPenaltyLog10(globalErrorRate); + return sum; + } + } + + /** + * Return a freshly allocated tree prunes to have no more than maxDepth from the root to any leaf + * + * @param maxDepth + * @return + */ + public RecalDatumNode pruneToDepth(final int maxDepth) { + if ( maxDepth < 1 ) + throw new IllegalArgumentException("maxDepth < 1"); + else { + final Set> subPruned = new HashSet>(getNumSubnodes()); + if ( maxDepth > 1 ) + for ( final RecalDatumNode sub : subnodes ) + subPruned.add(sub.pruneToDepth(maxDepth - 1)); + return new RecalDatumNode(getRecalDatum(), fixedPenalty, subPruned); + } + } + + /** + * Return a freshly allocated tree with to no more than maxElements in order of penalty + * + * Note that nodes must have fixed penalties to this algorithm will fail. + * + * @param maxElements + * @return + */ + public RecalDatumNode pruneByPenalty(final int maxElements) { + RecalDatumNode root = this; + + while ( root.size() > maxElements ) { + // remove the lowest penalty element, and continue + root = root.removeLowestPenaltyNode(); + } + + // our size is below the target, so we are good, return + return root; + } + + /** + * Return a freshly allocated tree where all mergable nodes with < maxPenalty are merged + * + * Note that nodes must have fixed penalties to this algorithm will fail. + * + * @param maxPenaltyIn the maximum penalty we are allowed to incur for a merge + * @param applyBonferroniCorrection if true, we will adjust penalty by the phred-scaled bonferroni correction + * for the size of the initial tree. That is, if there are 10 nodes in the + * tree and maxPenalty is 20 we will actually enforce 10^-2 / 10 = 10^-3 = 30 + * penalty for multiple testing + * @return + */ + public RecalDatumNode pruneToNoMoreThanPenalty(final double maxPenaltyIn, final boolean applyBonferroniCorrection) { + RecalDatumNode root = this; + + final double bonferroniCorrection = 10 * Math.log10(this.size()); + final double maxPenalty = applyBonferroniCorrection ? maxPenaltyIn + bonferroniCorrection : maxPenaltyIn; + + if ( applyBonferroniCorrection ) + logger.info(String.format("Applying Bonferroni correction for %d nodes = %.2f to initial penalty %.2f for total " + + "corrected max penalty of %.2f", this.size(), bonferroniCorrection, maxPenaltyIn, maxPenalty)); + + while ( true ) { + final Pair, Double> minPenaltyNode = root.getMinPenaltyAboveLeafNode(); + + if ( minPenaltyNode == null || minPenaltyNode.getSecond() > maxPenalty ) { + // nothing to merge, or the best candidate is above our max allowed + if ( minPenaltyNode == null ) { + if ( logger.isDebugEnabled() ) logger.debug("Stopping because no candidates could be found"); + } else { + if ( logger.isDebugEnabled() ) logger.debug("Stopping because node " + minPenaltyNode.getFirst() + " has penalty " + minPenaltyNode.getSecond() + " > max " + maxPenalty); + } + break; + } else { + // remove the lowest penalty element, and continue + if ( logger.isDebugEnabled() ) logger.debug("Removing node " + minPenaltyNode.getFirst() + " with penalty " + minPenaltyNode.getSecond()); + root = root.removeLowestPenaltyNode(); + } + } + + // no more candidates exist with penalty < maxPenalty + return root; + } + + + /** + * Find the lowest penalty above leaf node in the tree, and return a tree without it + * + * Note this excludes the current (root) node + * + * @return + */ + private RecalDatumNode removeLowestPenaltyNode() { + final Pair, Double> nodeToRemove = getMinPenaltyAboveLeafNode(); + if ( logger.isDebugEnabled() ) + logger.debug("Removing " + nodeToRemove.getFirst() + " with penalty " + nodeToRemove.getSecond()); + + final Pair, Boolean> result = removeNode(nodeToRemove.getFirst()); + + if ( ! result.getSecond() ) + throw new IllegalStateException("Never removed any node!"); + + final RecalDatumNode oneRemoved = result.getFirst(); + if ( oneRemoved == null ) + throw new IllegalStateException("Removed our root node, wow, didn't expect that"); + return oneRemoved; + } + + /** + * Finds in the tree the node with the lowest penalty whose subnodes are all leaves + * + * @return the node and its penalty, or null if no such node exists + */ + private Pair, Double> getMinPenaltyAboveLeafNode() { + if ( isLeaf() ) + // not allowed to remove leafs directly + return null; + if ( isAboveOnlyLeaves() ) + // we only consider removing nodes above all leaves + return new Pair, Double>(this, getPenalty()); + else { + // just recurse, taking the result with the min penalty of all subnodes + Pair, Double> minNode = null; + for ( final RecalDatumNode sub : subnodes ) { + final Pair, Double> subFind = sub.getMinPenaltyAboveLeafNode(); + if ( subFind != null && (minNode == null || subFind.getSecond() < minNode.getSecond()) ) { + minNode = subFind; + } + } + return minNode; + } + } + + /** + * Return a freshly allocated tree without the node nodeToRemove + * + * @param nodeToRemove + * @return + */ + private Pair, Boolean> removeNode(final RecalDatumNode nodeToRemove) { + if ( this == nodeToRemove ) { + if ( isLeaf() ) + throw new IllegalStateException("Trying to remove a leaf node from the tree! " + this + " " + nodeToRemove); + // node is the thing we are going to remove, but without any subnodes + final RecalDatumNode node = new RecalDatumNode(getRecalDatum(), fixedPenalty); + return new Pair, Boolean>(node, true); + } else { + // did we remove something in a sub branch? + boolean removedSomething = false; + + // our sub nodes with the penalty node removed + final Set> sub = new HashSet>(getNumSubnodes()); + + for ( final RecalDatumNode sub1 : subnodes ) { + if ( removedSomething ) { + // already removed something, just add sub1 back to sub + sub.add(sub1); + } else { + // haven't removed anything yet, so try + final Pair, Boolean> maybeRemoved = sub1.removeNode(nodeToRemove); + removedSomething = maybeRemoved.getSecond(); + sub.add(maybeRemoved.getFirst()); + } + } + + final RecalDatumNode node = new RecalDatumNode(getRecalDatum(), fixedPenalty, sub); + return new Pair, Boolean>(node, removedSomething); + } + } + + /** + * Return a collection of all of the data in the leaf nodes of this tree + * + * @return + */ + public Collection getAllLeaves() { + final LinkedList list = new LinkedList(); + getAllLeavesRec(list); + return list; + } + + /** + * Helpful recursive function for getAllLeaves() + * + * @param list the destination for the list of leaves + */ + private void getAllLeavesRec(final LinkedList list) { + if ( isLeaf() ) + list.add(getRecalDatum()); + else { + for ( final RecalDatumNode sub : subnodes ) + sub.getAllLeavesRec(list); + } + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalUtils.java new file mode 100644 index 000000000..f2f33ee59 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalUtils.java @@ -0,0 +1,1097 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.recalibration.covariates.*; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportTable; +import org.broadinstitute.gatk.utils.classloader.JVMUtils; +import org.broadinstitute.gatk.utils.recalibration.EventType; +import org.broadinstitute.gatk.utils.BaseUtils; +import org.broadinstitute.gatk.utils.R.RScriptExecutor; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.classloader.PluginManager; +import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.exceptions.DynamicClassResolutionException; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.io.Resource; +import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; + +import java.io.*; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 6, 2009 + * + * This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions. + * It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias. + * This class holds the parsing methods that are shared between BaseRecalibrator and PrintReads. + */ + +public class RecalUtils { + public final static String ARGUMENT_REPORT_TABLE_TITLE = "Arguments"; + public final static String QUANTIZED_REPORT_TABLE_TITLE = "Quantized"; + public final static String READGROUP_REPORT_TABLE_TITLE = "RecalTable0"; + public final static String QUALITY_SCORE_REPORT_TABLE_TITLE = "RecalTable1"; + public final static String ALL_COVARIATES_REPORT_TABLE_TITLE = "RecalTable2"; + + public final static String ARGUMENT_COLUMN_NAME = "Argument"; + public final static String ARGUMENT_VALUE_COLUMN_NAME = "Value"; + public final static String QUANTIZED_VALUE_COLUMN_NAME = "QuantizedScore"; + public static final String QUANTIZED_COUNT_COLUMN_NAME = "Count"; + public final static String READGROUP_COLUMN_NAME = "ReadGroup"; + public final static String EVENT_TYPE_COLUMN_NAME = "EventType"; + public final static String EMPIRICAL_QUALITY_COLUMN_NAME = "EmpiricalQuality"; + public final static String ESTIMATED_Q_REPORTED_COLUMN_NAME = "EstimatedQReported"; + public final static String QUALITY_SCORE_COLUMN_NAME = "QualityScore"; + public final static String COVARIATE_VALUE_COLUMN_NAME = "CovariateValue"; + public final static String COVARIATE_NAME_COLUMN_NAME = "CovariateName"; + public final static String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations"; + public final static String NUMBER_ERRORS_COLUMN_NAME = "Errors"; + + private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams + private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color + private static boolean warnUserNullPlatform = false; + + private static final String SCRIPT_FILE = "BQSR.R"; + + private static final Pair covariateValue = new Pair(RecalUtils.COVARIATE_VALUE_COLUMN_NAME, "%s"); + private static final Pair covariateName = new Pair(RecalUtils.COVARIATE_NAME_COLUMN_NAME, "%s"); + private static final Pair eventType = new Pair(RecalUtils.EVENT_TYPE_COLUMN_NAME, "%s"); + private static final Pair empiricalQuality = new Pair(RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f"); + private static final Pair estimatedQReported = new Pair(RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f"); + private static final Pair nObservations = new Pair(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d"); + private static final Pair nErrors = new Pair(RecalUtils.NUMBER_ERRORS_COLUMN_NAME, "%.2f"); + + /** + * Generates two lists : required covariates and optional covariates based on the user's requests. + * + * Performs the following tasks in order: + * 1. Adds all requierd covariates in order + * 2. Check if the user asked to use the standard covariates and adds them all if that's the case + * 3. Adds all covariates requested by the user that were not already added by the two previous steps + * + * @param argumentCollection the argument collection object for the recalibration walker + * @return a pair of ordered lists : required covariates (first) and optional covariates (second) + */ + public static Pair, ArrayList> initializeCovariates(RecalibrationArgumentCollection argumentCollection) { + final List> covariateClasses = new PluginManager(Covariate.class).getPlugins(); + final List> requiredClasses = new PluginManager(RequiredCovariate.class).getPlugins(); + final List> standardClasses = new PluginManager(StandardCovariate.class).getPlugins(); + + final ArrayList requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates + ArrayList optionalCovariates = new ArrayList(); + if (!argumentCollection.DO_NOT_USE_STANDARD_COVARIATES) + optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user + + // parse the -cov arguments that were provided, skipping over the ones already specified + if (argumentCollection.COVARIATES != null) { + for (String requestedCovariateString : argumentCollection.COVARIATES) { + // help the transition from BQSR v1 to BQSR v2 + if ( requestedCovariateString.equals("DinucCovariate") ) + throw new UserException.CommandLineException("DinucCovariate has been retired. Please use its successor covariate " + + "ContextCovariate instead, which includes the 2 bp (dinuc) substitution model of the retired DinucCovariate " + + "as well as an indel context to model the indel error rates"); + + boolean foundClass = false; + for (Class covClass : covariateClasses) { + if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class + foundClass = true; + if (!requiredClasses.contains(covClass) && + (argumentCollection.DO_NOT_USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) { + try { + final Covariate covariate = covClass.newInstance(); // now that we've found a matching class, try to instantiate it + optionalCovariates.add(covariate); + } catch (Exception e) { + throw new DynamicClassResolutionException(covClass, e); + } + } + } + } + + if (!foundClass) { + throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates."); + } + } + } + return new Pair, ArrayList>(requiredCovariates, optionalCovariates); + } + + /** + * Adds the required covariates to a covariate list + * + * Note: this method really only checks if the classes object has the expected number of required covariates, then add them by hand. + * + * @param classes list of classes to add to the covariate list + * @return the covariate list + */ + private static ArrayList addRequiredCovariatesToList(List> classes) { + ArrayList dest = new ArrayList(classes.size()); + if (classes.size() != 2) + throw new ReviewedGATKException("The number of required covariates has changed, this is a hard change in the code and needs to be inspected"); + + dest.add(new ReadGroupCovariate()); // enforce the order with RG first and QS next. + dest.add(new QualityScoreCovariate()); + return dest; + } + + /** + * Adds the standard covariates to a covariate list + * + * @param classes list of classes to add to the covariate list + * @return the covariate list + */ + private static ArrayList addStandardCovariatesToList(List> classes) { + ArrayList dest = new ArrayList(classes.size()); + for (Class covClass : classes) { + try { + final Covariate covariate = (Covariate) covClass.newInstance(); + dest.add(covariate); + } catch (Exception e) { + throw new DynamicClassResolutionException(covClass, e); + } + } + return dest; + } + + /** + * Print a list of all available covariates to logger as info + * + * @param logger + */ + public static void listAvailableCovariates(final Logger logger) { + logger.info("Available covariates:"); + for (final Class covClass : new PluginManager(Covariate.class).getPlugins()) { + logger.info(String.format("\t%30s\t%s", covClass.getSimpleName(), JVMUtils.classInterfaces(covClass))); + } + } + + /** + * Component used to print out csv representation of the reports that can be use to perform analysis in + * external tools. E.g. generate plots using R scripts. + *

+ * A header is always printed into the output stream (or file) when the printer is created. Then you only need + * to call {@link #print(RecalibrationReport,String) print} for each report you want to include in the csv file. + * Once finished, you close the printer calling {@link #close() close} + * + */ + private static class CsvPrinter { + + private final PrintStream ps; + private final Covariate[] covariates; + + /** + * Constructs a printer redirected to an output file. + * @param out the output file. + * @param c covariates to print out. + * @throws FileNotFoundException if the file could not be created anew. + */ + protected CsvPrinter(final File out, final Covariate ... c) + throws FileNotFoundException { + this(new FileOutputStream(out), c); + } + + /** + * Constructs a printer redirected to an output stream + * @param os the output. + * @param c covariates to print out. + */ + protected CsvPrinter(final OutputStream os, final Covariate ... c) { + covariates = c == null ? new Covariate[0] : c.clone(); + ps = new PrintStream(os); + printHeader(); + } + + /** + * Prints the header out. + *

+ * Should only be invoked at creation. + */ + protected void printHeader() { + RecalUtils.printHeader(ps); + } + + /** + * Prints out a report into the csv file. + * + * + * @param report the report to print out. + * @param mode the report associated mode. (typically ORIGINAL, RECALIBRATED + */ + public void print(final RecalibrationReport report, final String mode) { + RecalUtils.writeCSV(ps,report.getRecalibrationTables(),mode,covariates,false); + } + + /** + * Close the csv printer. + * + * No further output will be allowed or take place after calling this method. + */ + public void close() { + ps.close(); + } + + } + + /** + * Returns a csv output printer. + * + * @param out the output file. It will be overridden + * @param c list of covariates to print out. + * + * @throws FileNotFoundException if out could not be created anew. + * + * @return never null + */ + protected static CsvPrinter csvPrinter(final File out, final Covariate ... c) + throws FileNotFoundException + { + if (c == null) { + throw new IllegalArgumentException("the input covariate array cannot be null"); + } + return new CsvPrinter(out,c); + } + + /** + * Prints out a collection of reports into a file in Csv format in a way + * that can be used by R scripts (such as the plot generator script). + *

+ * The set of covariates is take as the minimum common set from all reports. + * + * @param out the output file. It will be overridden. + * @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...) + * of each report and the corresponding value the report itself. + * @throws FileNotFoundException if out could not be created anew. + */ + public static void generateCsv(final File out, final Map reports) + throws FileNotFoundException { + if (reports.size() == 0) { + writeCsv(out, reports, new Covariate[0]); + } else { + final Iterator rit = reports.values().iterator(); + final RecalibrationReport first = rit.next(); + final Covariate[] firstCovariates = first.getRequestedCovariates(); + final Set covariates = new LinkedHashSet<>(); + Utils.addAll(covariates,firstCovariates); + while (rit.hasNext() && covariates.size() > 0) { + final Covariate[] nextCovariates = rit.next().getRequestedCovariates(); + final Set nextCovariateNames = new LinkedHashSet(nextCovariates.length); + for (final Covariate nc : nextCovariates) { + nextCovariateNames.add(nc.getClass().getSimpleName()); + } + final Iterator cit = covariates.iterator(); + while (cit.hasNext()) { + if (!nextCovariateNames.contains(cit.next().getClass().getSimpleName())) { + cit.remove(); + } + } + } + writeCsv(out, reports, covariates.toArray(new Covariate[covariates.size()])); + } + } + + /** + * Print out a collection of reports into a file in Csv format in a way + * that can be used by R scripts (such as the plot generator script). + * + * @param out + * @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...) + * of each report and the corresponding value the report itself. + * @param c the covariates to print out. + * @throws FileNotFoundException if out could not be created anew. + */ + private static void writeCsv(final File out, + final Map reports, final Covariate[] c) + throws FileNotFoundException { + final CsvPrinter p = csvPrinter(out,c); + for (Map.Entry e : reports.entrySet()) { + p.print(e.getValue(),e.getKey()); + } + p.close(); + } + + public enum SOLID_RECAL_MODE { + /** + * Treat reference inserted bases as reference matching bases. Very unsafe! + */ + DO_NOTHING, + /** + * Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option. + */ + SET_Q_ZERO, + /** + * In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV. + */ + SET_Q_ZERO_BASE_N, + /** + * Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. + */ + REMOVE_REF_BIAS; + + public static SOLID_RECAL_MODE recalModeFromString(String recalMode) { + if (recalMode.equals("DO_NOTHING")) + return SOLID_RECAL_MODE.DO_NOTHING; + if (recalMode.equals("SET_Q_ZERO")) + return SOLID_RECAL_MODE.SET_Q_ZERO; + if (recalMode.equals("SET_Q_ZERO_BASE_N")) + return SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N; + if (recalMode.equals("REMOVE_REF_BIAS")) + return SOLID_RECAL_MODE.REMOVE_REF_BIAS; + + throw new UserException.BadArgumentValue(recalMode, "is not a valid SOLID_RECAL_MODE value"); + } + } + + public enum SOLID_NOCALL_STRATEGY { + /** + * When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option. + */ + THROW_EXCEPTION, + /** + * Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare. + */ + LEAVE_READ_UNRECALIBRATED, + /** + * Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. + */ + PURGE_READ; + + public static SOLID_NOCALL_STRATEGY nocallStrategyFromString(String nocallStrategy) { + if (nocallStrategy.equals("THROW_EXCEPTION")) + return SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; + if (nocallStrategy.equals("LEAVE_READ_UNRECALIBRATED")) + return SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED; + if (nocallStrategy.equals("PURGE_READ")) + return SOLID_NOCALL_STRATEGY.PURGE_READ; + + throw new UserException.BadArgumentValue(nocallStrategy, "is not a valid SOLID_NOCALL_STRATEGY value"); + } + } + + private static List generateReportTables(final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) { + List result = new LinkedList(); + int reportTableIndex = 0; + int rowIndex = 0; + final Map covariateNameMap = new HashMap(requestedCovariates.length); + for (final Covariate covariate : requestedCovariates) + covariateNameMap.put(covariate, parseCovariateName(covariate)); + + for (int tableIndex = 0; tableIndex < recalibrationTables.numTables(); tableIndex++) { + + final ArrayList> columnNames = new ArrayList>(); // initialize the array to hold the column names + columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[0]), "%s")); // save the required covariate name so we can reference it in the future + if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) { + columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[1]), "%s")); // save the required covariate name so we can reference it in the future + if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { + columnNames.add(covariateValue); + columnNames.add(covariateName); + } + } + + columnNames.add(eventType); // the order of these column names is important here + columnNames.add(empiricalQuality); + if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) + columnNames.add(estimatedQReported); // only the read group table needs the estimated Q reported + columnNames.add(nObservations); + columnNames.add(nErrors); + + final GATKReportTable reportTable; + if (tableIndex <= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { + if(sortByCols) { + reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), GATKReportTable.TableSortingWay.SORT_BY_COLUMN); + } else { + reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), GATKReportTable.TableSortingWay.DO_NOT_SORT); + } + for (final Pair columnName : columnNames) + reportTable.addColumn(columnName.getFirst(), columnName.getSecond()); + rowIndex = 0; // reset the row index since we're starting with a new table + } else { + reportTable = result.get(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()); + } + + final NestedIntegerArray table = recalibrationTables.getTable(tableIndex); + for (final NestedIntegerArray.Leaf row : table.getAllLeaves()) { + final RecalDatum datum = (RecalDatum)row.value; + final int[] keys = row.keys; + + int columnIndex = 0; + int keyIndex = 0; + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), requestedCovariates[0].formatKey(keys[keyIndex++])); + if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) { + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), requestedCovariates[1].formatKey(keys[keyIndex++])); + if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { + final Covariate covariate = requestedCovariates[tableIndex]; + + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), covariate.formatKey(keys[keyIndex++])); + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), covariateNameMap.get(covariate)); + } + } + + final EventType event = EventType.eventFrom(keys[keyIndex]); + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), event.toString()); + + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEmpiricalQuality()); + if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getNumObservations()); + reportTable.set(rowIndex, columnNames.get(columnIndex).getFirst(), datum.getNumMismatches()); + + rowIndex++; + } + result.add(reportTable); + } + + return result; + } + + private static String parseCovariateName(final Covariate covariate) { + return covariate.getClass().getSimpleName().split("Covariate")[0]; + } + + /** + * Return a human-readable string representing the used covariates + * + * @param requestedCovariates a vector of covariates + * @return a non-null comma-separated string + */ + public static String covariateNames(final Covariate[] requestedCovariates) { + final List names = new ArrayList(requestedCovariates.length); + for ( final Covariate cov : requestedCovariates ) + names.add(cov.getClass().getSimpleName()); + return Utils.join(",", names); + } + + /** + * Outputs the GATK report to RAC.RECAL_TABLE. + * + * @param RAC The list of shared command line arguments + * @param quantizationInfo Quantization info + * @param recalibrationTables Recalibration tables + * @param requestedCovariates The list of requested covariates + * @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT + */ + public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) { + final GATKReport report = createRecalibrationGATKReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols)); + report.print(RAC.RECAL_TABLE); + } + + /** + * Creates a consolidated GATK report, first generating report tables. Report can then be written to a stream via GATKReport.print(PrintStream). + * + * @param argumentTable Argument table + * @param quantizationInfo Quantization info + * @param recalibrationTables Recalibration tables + * @param requestedCovariates The list of requested covariates + * @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT + * @return GATK report + */ + public static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final boolean sortByCols) { + return createRecalibrationGATKReport(argumentTable, quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols)); + } + + /** + * Creates a consolidated GATK report from the tables. Report can then be written to a stream via GATKReport.print(PrintStream). + * + * @param argumentTable Argument table + * @param quantizationTable Quantization Table + * @param recalTables Other recal tables + * @return GATK report + */ + private static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List recalTables) { + final GATKReport report = new GATKReport(); + report.addTable(argumentTable); + report.addTable(quantizationTable); + report.addTables(recalTables); + return report; + } + + /** s + * Write recalibration plots into a file + * + * @param csvFile location of the intermediary file + * @param exampleReportFile where the report arguments are collected from. + * @param output result plot file name. + */ + public static void generatePlots(final File csvFile, final File exampleReportFile, final File output) { + final RScriptExecutor executor = new RScriptExecutor(); + executor.setExceptOnError(true); + executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class)); + executor.addArgs(csvFile.getAbsolutePath()); + executor.addArgs(exampleReportFile.getAbsolutePath()); + executor.addArgs(output.getAbsolutePath()); + Logger.getLogger(RecalUtils.class).debug("R command line: " + executor.getApproximateCommandLine()); + executor.exec(); + } + + private static void outputRecalibrationPlot(final File csvFile, final RecalibrationArgumentCollection RAC) { + + final RScriptExecutor executor = new RScriptExecutor(); + executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class)); + executor.addArgs(csvFile.getAbsolutePath()); + executor.addArgs(RAC.RECAL_TABLE_FILE.getAbsolutePath()); + executor.exec(); + } + + /** + * Please use {@link #generateCsv(java.io.File, java.util.Map)} and {@link #generatePlots(java.io.File, java.io.File, java.io.File)} instead. + * + * @deprecated + */ + @Deprecated + public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final Covariate[] requestedCovariates) { + generateRecalibrationPlot(RAC, original, null, requestedCovariates); + } + + /** + * Please use {@link #generateCsv(java.io.File, java.util.Map)} and {@link #generatePlots(java.io.File, java.io.File, java.io.File)} instead. + * + * @deprecated + */ + @Deprecated + public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates) { + final PrintStream csvStream; + final File csvTempFile = null; + try { + File csvTmpFile = File.createTempFile("BQSR",".csv"); + csvTmpFile.deleteOnExit(); + csvStream = new PrintStream(csvTmpFile); + } catch (IOException e) { + throw new UserException("Could not create temporary csv file", e); + } + + if ( recalibrated != null ) + writeCSV(csvStream, recalibrated, "RECALIBRATED", requestedCovariates, true); + writeCSV(csvStream, original, "ORIGINAL", requestedCovariates, recalibrated == null); + csvStream.close(); + outputRecalibrationPlot(csvTempFile, RAC); + csvTempFile.delete(); + } + + private static void writeCSV(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final Covariate[] requestedCovariates, final boolean printHeader) { + + final NestedIntegerArray deltaTable = createDeltaTable(recalibrationTables, requestedCovariates.length); + + // add the quality score table to the delta table + final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); + for (final NestedIntegerArray.Leaf leaf : qualTable.getAllLeaves()) { // go through every element in the covariates table to create the delta table + final int[] newCovs = new int[4]; + newCovs[0] = leaf.keys[0]; + newCovs[1] = requestedCovariates.length; // replace the covariate name with an arbitrary (unused) index for QualityScore + newCovs[2] = leaf.keys[1]; + newCovs[3] = leaf.keys[2]; + addToDeltaTable(deltaTable, newCovs, (RecalDatum)leaf.value); // add this covariate to the delta table + } + + // add the optional covariates to the delta table + for (int i = RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < requestedCovariates.length; i++) { + final NestedIntegerArray covTable = recalibrationTables.getTable(i); + for (final NestedIntegerArray.Leaf leaf : covTable.getAllLeaves()) { + final int[] covs = new int[4]; + covs[0] = leaf.keys[0]; + covs[1] = i; // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) + covs[2] = leaf.keys[2]; + covs[3] = leaf.keys[3]; + addToDeltaTable(deltaTable, covs, (RecalDatum) leaf.value); // add this covariate to the delta table + } + } + + // output the csv file + if (printHeader) { + printHeader(deltaTableFile); + } + + final Map covariateNameMap = new HashMap(requestedCovariates.length); + for (final Covariate covariate : requestedCovariates) + covariateNameMap.put(covariate, parseCovariateName(covariate)); + + // print each data line + for (final NestedIntegerArray.Leaf leaf : deltaTable.getAllLeaves()) { + final List deltaKeys = generateValuesFromKeys(leaf.keys, requestedCovariates, covariateNameMap); + final RecalDatum deltaDatum = leaf.value; + deltaTableFile.print(Utils.join(",", deltaKeys)); + deltaTableFile.print("," + deltaDatum.stringForCSV()); + deltaTableFile.println("," + recalibrationMode); + } + } + + private static void printHeader(PrintStream out) { + final List header = new LinkedList(); + header.add("ReadGroup"); + header.add("CovariateValue"); + header.add("CovariateName"); + header.add("EventType"); + header.add("Observations"); + header.add("Errors"); + header.add("EmpiricalQuality"); + header.add("AverageReportedQuality"); + header.add("Accuracy"); + header.add("Recalibration"); + out.println(Utils.join(",", header)); + } + + /* + * Return an initialized nested integer array with appropriate dimensions for use with the delta tables + * + * @param recalibrationTables the recal tables + * @param numCovariates the total number of covariates being used + * @return a non-null nested integer array + */ + @Requires("recalibrationTables != null && numCovariates > 0") + @Ensures("result != null") + private static NestedIntegerArray createDeltaTable(final RecalibrationTables recalibrationTables, final int numCovariates) { + + final int[] dimensionsForDeltaTable = new int[4]; + + // initialize the dimensions with those of the qual table to start with + final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); + final int[] dimensionsOfQualTable = qualTable.getDimensions(); + dimensionsForDeltaTable[0] = dimensionsOfQualTable[0]; // num read groups + dimensionsForDeltaTable[1] = numCovariates + 1; // num covariates + dimensionsForDeltaTable[2] = dimensionsOfQualTable[1]; + dimensionsForDeltaTable[3] = dimensionsOfQualTable[2]; + + // now, update the dimensions based on the optional covariate tables as needed + for ( int i = RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < numCovariates; i++ ) { + final NestedIntegerArray covTable = recalibrationTables.getTable(i); + final int[] dimensionsOfCovTable = covTable.getDimensions(); + dimensionsForDeltaTable[2] = Math.max(dimensionsForDeltaTable[2], dimensionsOfCovTable[2]); + dimensionsForDeltaTable[3] = Math.max(dimensionsForDeltaTable[3], dimensionsOfCovTable[3]); + } + + return new NestedIntegerArray(dimensionsForDeltaTable); + } + + protected static List generateValuesFromKeys(final int[] keys, final Covariate[] covariates, final Map covariateNameMap) { + final List values = new ArrayList(4); + values.add(covariates[RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()].formatKey(keys[0])); + + final int covariateIndex = keys[1]; + final int covariateKey = keys[2]; + final Covariate covariate = covariateIndex == covariates.length ? covariates[RecalibrationTables.TableType.QUALITY_SCORE_TABLE.ordinal()] : covariates[covariateIndex]; + values.add(covariate.formatKey(covariateKey)); + values.add(covariateNameMap.get(covariate)); + values.add(EventType.eventFrom(keys[3]).prettyPrint()); + + return values; + } + + /** + * Updates the current RecalDatum element in the delta table. + * + * If it doesn't have an element yet, it creates an RecalDatum element and adds it to the delta table. + * + * @param deltaTable the delta table + * @param deltaKey the key to the table + * @param recalDatum the recal datum to combine with the accuracyDatum element in the table + */ + private static void addToDeltaTable(final NestedIntegerArray deltaTable, final int[] deltaKey, final RecalDatum recalDatum) { + final RecalDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key + if (deltaDatum == null) + // if we don't have a key yet, create a new one with the same values as the current datum + deltaTable.put(new RecalDatum(recalDatum), deltaKey); + else + // if we do have a datum, combine it with this one + deltaDatum.combine(recalDatum); + } + + /** + * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string + * + * @param read The read to adjust + * @param RAC The list of shared command line arguments + */ + public static void parsePlatformForRead(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) { + GATKSAMReadGroupRecord readGroup = read.getReadGroup(); + + if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { + readGroup.setPlatform(RAC.FORCE_PLATFORM); + } + + if (readGroup.getPlatform() == null) { + if (RAC.DEFAULT_PLATFORM != null) { + if (!warnUserNullPlatform) { + Utils.warnUser("The input .bam file contains reads with no platform information. " + + "Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " + + "First observed at read with name = " + read.getReadName()); + warnUserNullPlatform = true; + } + readGroup.setPlatform(RAC.DEFAULT_PLATFORM); + } + else { + throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName()); + } + } + } + + /** + * Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are + * inconsistent with the color space. If there is a no call in the color space, this method returns false meaning + * this read should be skipped + * + * @param strategy the strategy used for SOLID no calls + * @param read The SAMRecord to parse + * @return true if this read is consistent or false if this read should be skipped + */ + public static boolean isColorSpaceConsistent(final SOLID_NOCALL_STRATEGY strategy, final GATKSAMRecord read) { + if (!ReadUtils.isSOLiDRead(read)) // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base + return true; + + // Haven't calculated the inconsistency array yet for this read + if (read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG) == null) { + final Object attr = read.getAttribute(RecalUtils.COLOR_SPACE_ATTRIBUTE_TAG); + if (attr != null) { + byte[] colorSpace; + if (attr instanceof String) + colorSpace = ((String) attr).getBytes(); + else + throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", RecalUtils.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); + + final boolean badColor = hasNoCallInColorSpace(colorSpace); + if (badColor) { + if (strategy == SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) { + return false; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them + } + else if (strategy == SOLID_NOCALL_STRATEGY.PURGE_READ) { + read.setReadFailsVendorQualityCheckFlag(true); + return false; + } + } + + byte[] readBases = read.getReadBases(); // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read + if (read.getReadNegativeStrandFlag()) + readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); + + final byte[] inconsistency = new byte[readBases.length]; + int i; + byte prevBase = colorSpace[0]; // The sentinel + for (i = 0; i < readBases.length; i++) { + final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[i + 1]); + inconsistency[i] = (byte) (thisBase == readBases[i] ? 0 : 1); + prevBase = readBases[i]; + } + read.setAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency); + } + else if (strategy == SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) // if the strategy calls for an exception, throw it + throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); + + else + return false; // otherwise, just skip the read + } + + return true; + } + + private static boolean hasNoCallInColorSpace(final byte[] colorSpace) { + final int length = colorSpace.length; + for (int i = 1; i < length; i++) { // skip the sentinal + final byte color = colorSpace[i]; + if (color != (byte) '0' && color != (byte) '1' && color != (byte) '2' && color != (byte) '3') { + return true; // There is a bad color in this SOLiD read + } + } + + return false; // There aren't any color no calls in this SOLiD read + } + + /** + * Given the base and the color calculate the next base in the sequence + * + * @param read the read + * @param prevBase The base + * @param color The color + * @return The next base in the sequence + */ + private static byte getNextBaseFromColor(GATKSAMRecord read, final byte prevBase, final byte color) { + switch (color) { + case '0': + return prevBase; + case '1': + return performColorOne(prevBase); + case '2': + return performColorTwo(prevBase); + case '3': + return performColorThree(prevBase); + default: + throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLID read, color = " + (char) color + + " Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias."); + } + } + + /** + * Check if this base is inconsistent with its color space. If it is then SOLID inserted the reference here and we should reduce the quality + * + * @param read The read which contains the color space to check against + * @param offset The offset in the read at which to check + * @return Returns true if the base was inconsistent with the color space + */ + public static boolean isColorSpaceConsistent(final GATKSAMRecord read, final int offset) { + final Object attr = read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG); + if (attr != null) { + final byte[] inconsistency = (byte[]) attr; + // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference! + if (read.getReadNegativeStrandFlag()) { // Negative direction + return inconsistency[inconsistency.length - offset - 1] == (byte) 0; + } + else { // Forward direction + return inconsistency[offset] == (byte) 0; + } + + // This block of code is for if you want to check both the offset and the next base for color space inconsistency + //if( read.getReadNegativeStrandFlag() ) { // Negative direction + // if( offset == 0 ) { + // return inconsistency[0] != 0; + // } else { + // return (inconsistency[inconsistency.length - offset - 1] != 0) || (inconsistency[inconsistency.length - offset] != 0); + // } + //} else { // Forward direction + // if( offset == inconsistency.length - 1 ) { + // return inconsistency[inconsistency.length - 1] != 0; + // } else { + // return (inconsistency[offset] != 0) || (inconsistency[offset + 1] != 0); + // } + //} + + } + else { // No inconsistency array, so nothing is inconsistent + return true; + } + } + + /** + * Computes all requested covariates for every offset in the given read + * by calling covariate.getValues(..). + * + * It populates an array of covariate values where result[i][j] is the covariate + * value for the ith position in the read and the jth covariate in + * reqeustedCovariates list. + * + * @param read The read for which to compute covariate values. + * @param requestedCovariates The list of requested covariates. + * @return a matrix with all the covariates calculated for every base in the read + */ + public static ReadCovariates computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates) { + final ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), requestedCovariates.length); + computeCovariates(read, requestedCovariates, readCovariates); + return readCovariates; + } + + /** + * Computes all requested covariates for every offset in the given read + * by calling covariate.getValues(..). + * + * It populates an array of covariate values where result[i][j] is the covariate + * value for the ith position in the read and the jth covariate in + * reqeustedCovariates list. + * + * @param read The read for which to compute covariate values. + * @param requestedCovariates The list of requested covariates. + * @param resultsStorage The object to store the covariate values + */ + public static void computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates, final ReadCovariates resultsStorage) { + // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read + for (int i = 0; i < requestedCovariates.length; i++) { + resultsStorage.setCovariateIndex(i); + requestedCovariates[i].recordValues(read, resultsStorage); + } + } + + /** + * Perform a certain transversion (A <-> C or G <-> T) on the base. + * + * @param base the base [AaCcGgTt] + * @return the transversion of the base, or the input base if it's not one of the understood ones + */ + private static byte performColorOne(byte base) { + switch (base) { + case 'A': + case 'a': + return 'C'; + case 'C': + case 'c': + return 'A'; + case 'G': + case 'g': + return 'T'; + case 'T': + case 't': + return 'G'; + default: + return base; + } + } + + /** + * Perform a transition (A <-> G or C <-> T) on the base. + * + * @param base the base [AaCcGgTt] + * @return the transition of the base, or the input base if it's not one of the understood ones + */ + private static byte performColorTwo(byte base) { + switch (base) { + case 'A': + case 'a': + return 'G'; + case 'C': + case 'c': + return 'T'; + case 'G': + case 'g': + return 'A'; + case 'T': + case 't': + return 'C'; + default: + return base; + } + } + + /** + * Return the complement (A <-> T or C <-> G) of a base. + * + * @param base the base [AaCcGgTt] + * @return the complementary base, or the input base if it's not one of the understood ones + */ + private static byte performColorThree(byte base) { + switch (base) { + case 'A': + case 'a': + return 'T'; + case 'C': + case 'c': + return 'G'; + case 'G': + case 'g': + return 'C'; + case 'T': + case 't': + return 'A'; + default: + return base; + } + } + + /** + * Combines the recalibration data for table1 and table2 into table1 + * + * Note that table1 is the destination, so it is modified + * + * @param table1 the destination table to merge table2 into + * @param table2 the source table to merge into table1 + */ + public static void combineTables(final NestedIntegerArray table1, final NestedIntegerArray table2) { + if ( table1 == null ) throw new IllegalArgumentException("table1 cannot be null"); + if ( table2 == null ) throw new IllegalArgumentException("table2 cannot be null"); + if ( ! Arrays.equals(table1.getDimensions(), table2.getDimensions())) + throw new IllegalArgumentException("Table1 " + Utils.join(",", table1.getDimensions()) + " not equal to " + Utils.join(",", table2.getDimensions())); + + for (final NestedIntegerArray.Leaf row : table2.getAllLeaves()) { + final RecalDatum myDatum = table1.get(row.keys); + + if (myDatum == null) + table1.put(row.value, row.keys); + else + myDatum.combine(row.value); + } + } + + /** + * Increments the RecalDatum at the specified position in the specified table, or put a new item there + * if there isn't already one. + * + * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() + * to return false if another thread inserts a new item at our position in the middle of our put operation. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error value for this event + * @param keys location in table of our item + */ + public static void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, + final byte qual, + final double isError, + final int... keys ) { + final RecalDatum existingDatum = table.get(keys); + + if ( existingDatum == null ) { + // No existing item, try to put a new one + if ( ! table.put(createDatumObject(qual, isError), keys) ) { + // Failed to put a new item because another thread came along and put an item here first. + // Get the newly-put item and increment it (item is guaranteed to exist at this point) + table.get(keys).increment(1L, isError); + } + } + else { + // Easy case: already an item here, so increment it + existingDatum.increment(1L, isError); + } + } + + /** + * creates a datum object with one observation and one or zero error + * + * @param reportedQual the quality score reported by the instrument for this base + * @param isError whether or not the observation is an error + * @return a new RecalDatum object with the observation and the error + */ + private static RecalDatum createDatumObject(final byte reportedQual, final double isError) { + return new RecalDatum(1, isError, reportedQual); + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java new file mode 100644 index 000000000..dcf7ed737 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java @@ -0,0 +1,425 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import com.google.java.contract.Requires; +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.utils.report.GATKReportTable; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.exceptions.GATKException; + +import java.io.File; +import java.io.PrintStream; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 27, 2009 + * + * A collection of the arguments that are used for BQSR. Used to be common to both CovariateCounterWalker and TableRecalibrationWalker. + * This set of arguments will also be passed to the constructor of every Covariate when it is instantiated. + */ + +public class RecalibrationArgumentCollection implements Cloneable { + + /** + * This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference, + * so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.) + * for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites. + * Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument. + */ + @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false) + public List> knownSites = Collections.emptyList(); + + /** + * After the header, data records occur one per line until the end of the file. The first several items on a line are the + * values of the individual covariates and will change depending on which covariates were specified at runtime. The last + * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, + * and the raw empirical quality score calculated by phred-scaling the mismatch rate. Use '/dev/stdout' to print to standard out. + */ + @Gather(BQSRGatherer.class) + @Output(doc = "The output recalibration table file to create", required = true) + public File RECAL_TABLE_FILE = null; + public PrintStream RECAL_TABLE; + + /** + * Note that the --list argument requires a fully resolved and correct command-line to work. + */ + @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false) + public boolean LIST_ONLY = false; + + /** + * Note that the ReadGroup and QualityScore covariates are required and do not need to be specified. + * Also, unless --no_standard_covs is specified, the Cycle and Context covariates are standard and are included by default. + * Use the --list argument to see the available covariates. + */ + @Argument(fullName = "covariate", shortName = "cov", doc = "One or more covariates to be used in the recalibration. Can be specified multiple times", required = false) + public String[] COVARIATES = null; + + /* + * The Cycle and Context covariates are standard and are included by default unless this argument is provided. + * Note that the ReadGroup and QualityScore covariates are required and cannot be excluded. + */ + @Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false) + public boolean DO_NOT_USE_STANDARD_COVARIATES = false; + + /** + * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option. + */ + @Advanced + @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.") + public boolean RUN_WITHOUT_DBSNP = false; + + /** + * BaseRecalibrator accepts a --solid_recal_mode flag which governs how the recalibrator handles the + * reads which have had the reference inserted because of color space inconsistencies. + */ + @Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS") + public RecalUtils.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.SET_Q_ZERO; + + /** + * BaseRecalibrator accepts a --solid_nocall_strategy flag which governs how the recalibrator handles + * no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in + * their color space tag can not be recalibrated. + */ + @Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false) + public RecalUtils.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; + + /** + * The context covariate will use a context of this size to calculate its covariate value for base mismatches. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size. + */ + @Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "Size of the k-mer context to be used for base mismatches", required = false) + public int MISMATCHES_CONTEXT_SIZE = 2; + + /** + * The context covariate will use a context of this size to calculate its covariate value for base insertions and deletions. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size. + */ + @Argument(fullName = "indels_context_size", shortName = "ics", doc = "Size of the k-mer context to be used for base insertions and deletions", required = false) + public int INDELS_CONTEXT_SIZE = 3; + + /** + * The cycle covariate will generate an error if it encounters a cycle greater than this value. + * This argument is ignored if the Cycle covariate is not used. + */ + @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "The maximum cycle value permitted for the Cycle covariate", required = false) + public int MAXIMUM_CYCLE_VALUE = 500; + + /** + * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is off] + */ + @Advanced + @Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false) + public byte MISMATCHES_DEFAULT_QUALITY = -1; + + /** + * A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. [default is on] + */ + @Advanced + @Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false) + public byte INSERTIONS_DEFAULT_QUALITY = 45; + + /** + * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is on] + */ + @Advanced + @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false) + public byte DELETIONS_DEFAULT_QUALITY = 45; + + /** + * Reads with low quality bases on either tail (beginning or end) will not be considered in the context. This parameter defines the quality below which (inclusive) a tail is considered low quality + */ + @Advanced + @Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false) + public byte LOW_QUAL_TAIL = 2; + + /** + * BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does not quantize the base qualities, this is done by the engine with the -qq or -BQSR options. + * This parameter tells BQSR the number of levels of quantization to use to build the quantization table. + */ + @Advanced + @Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output") + public int QUANTIZING_LEVELS = 16; + + /** + * The tag name for the binary tag covariate (if using it) + */ + @Advanced + @Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it") + public String BINARY_TAG_NAME = null; + + /** + * Whether GATK report tables should have rows in sorted order, starting from leftmost column + */ + @Argument(fullName = "sort_by_all_columns", shortName = "sortAllCols", doc = "Sort the rows in the tables of reports", required = false) + public Boolean SORT_BY_ALL_COLUMNS = false; + + ///////////////////////////// + // Debugging-only Arguments + ///////////////////////////// + + @Hidden + @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") + public String DEFAULT_PLATFORM = null; + + @Hidden + @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") + public String FORCE_PLATFORM = null; + + @Hidden + @Argument(fullName = "force_readgroup", shortName = "fRG", required = false, doc = "If provided, the read group of EVERY read will be forced to be the provided String.") + public String FORCE_READGROUP = null; + + @Hidden + @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only", defaultToStdout = false) + public PrintStream RECAL_TABLE_UPDATE_LOG = null; + + /** + * The repeat covariate will use a context of this size to calculate its covariate value for base insertions and deletions + */ + @Hidden + @Argument(fullName = "max_str_unit_length", shortName = "maxstr", doc = "Max size of the k-mer context to be used for repeat covariates", required = false) + public int MAX_STR_UNIT_LENGTH = 8; + + @Hidden + @Argument(fullName = "max_repeat_length", shortName = "maxrep", doc = "Max number of repetitions to be used for repeat covariates", required = false) + public int MAX_REPEAT_LENGTH = 20; + + + public File existingRecalibrationReport = null; + + public GATKReportTable generateReportTable(final String covariateNames) { + GATKReportTable argumentsTable; + if(SORT_BY_ALL_COLUMNS) { + argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2, GATKReportTable.TableSortingWay.SORT_BY_COLUMN); + } else { + argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2); + } + argumentsTable.addColumn("Argument"); + argumentsTable.addColumn(RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); + argumentsTable.addRowID("covariate", true); + argumentsTable.set("covariate", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, covariateNames); + argumentsTable.addRowID("no_standard_covs", true); + argumentsTable.set("no_standard_covs", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DO_NOT_USE_STANDARD_COVARIATES); + argumentsTable.addRowID("run_without_dbsnp", true); + argumentsTable.set("run_without_dbsnp", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RUN_WITHOUT_DBSNP); + argumentsTable.addRowID("solid_recal_mode", true); + argumentsTable.set("solid_recal_mode", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_RECAL_MODE); + argumentsTable.addRowID("solid_nocall_strategy", true); + argumentsTable.set("solid_nocall_strategy", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY); + argumentsTable.addRowID("mismatches_context_size", true); + argumentsTable.set("mismatches_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE); + argumentsTable.addRowID("indels_context_size", true); + argumentsTable.set("indels_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INDELS_CONTEXT_SIZE); + argumentsTable.addRowID("mismatches_default_quality", true); + argumentsTable.set("mismatches_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY); + argumentsTable.addRowID("deletions_default_quality", true); + argumentsTable.set("deletions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DELETIONS_DEFAULT_QUALITY); + argumentsTable.addRowID("insertions_default_quality", true); + argumentsTable.set("insertions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_DEFAULT_QUALITY); + argumentsTable.addRowID("maximum_cycle_value", true); + argumentsTable.set("maximum_cycle_value", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MAXIMUM_CYCLE_VALUE); + argumentsTable.addRowID("low_quality_tail", true); + argumentsTable.set("low_quality_tail", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, LOW_QUAL_TAIL); + argumentsTable.addRowID("default_platform", true); + argumentsTable.set("default_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM); + argumentsTable.addRowID("force_platform", true); + argumentsTable.set("force_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM); + argumentsTable.addRowID("quantizing_levels", true); + argumentsTable.set("quantizing_levels", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS); + argumentsTable.addRowID("recalibration_report", true); + argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, existingRecalibrationReport == null ? "null" : existingRecalibrationReport.getAbsolutePath()); + argumentsTable.addRowID("binary_tag_name", true); + argumentsTable.set("binary_tag_name", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME); + return argumentsTable; + } + + /** + * Returns a map with the arguments that differ between this an + * another {@link RecalibrationArgumentCollection} instance. + *

+ * The key is the name of that argument in the report file. The value is a message + * that explains the difference to the end user. + *

+ * Thus, a empty map indicates that there is no differences between both argument collection that + * is relevant to report comparison. + *

+ * This method should not throw any exception. + * + * @param other the argument-collection to compare against. + * @param thisRole the name used to refer to this RAC report that makes sense to the end user. + * @param otherRole the name used to refer to the other RAC report that makes sense to the end user. + * + * @return never null, but a zero-size collection if there are no differences. + */ + @Requires("other != null && thisRole != null && otherRole != null && !thisRole.equalsIgnoreCase(otherRole)") + public Map compareReportArguments(final RecalibrationArgumentCollection other,final String thisRole, final String otherRole) { + final Map result = new LinkedHashMap<>(15); + compareRequestedCovariates(result, other, thisRole, otherRole); + compareSimpleReportArgument(result,"no_standard_covs", DO_NOT_USE_STANDARD_COVARIATES, other.DO_NOT_USE_STANDARD_COVARIATES, thisRole, otherRole); + compareSimpleReportArgument(result,"run_without_dbsnp",RUN_WITHOUT_DBSNP,other.RUN_WITHOUT_DBSNP,thisRole,otherRole); + compareSimpleReportArgument(result,"solid_recal_mode", SOLID_RECAL_MODE, other.SOLID_RECAL_MODE,thisRole,otherRole); + compareSimpleReportArgument(result,"solid_nocall_strategy", SOLID_NOCALL_STRATEGY, other.SOLID_NOCALL_STRATEGY,thisRole,otherRole); + compareSimpleReportArgument(result,"mismatches_context_size", MISMATCHES_CONTEXT_SIZE,other.MISMATCHES_CONTEXT_SIZE,thisRole,otherRole); + compareSimpleReportArgument(result,"mismatches_default_quality", MISMATCHES_DEFAULT_QUALITY, other.MISMATCHES_DEFAULT_QUALITY,thisRole,otherRole); + compareSimpleReportArgument(result,"deletions_default_quality", DELETIONS_DEFAULT_QUALITY, other.DELETIONS_DEFAULT_QUALITY,thisRole,otherRole); + compareSimpleReportArgument(result,"insertions_default_quality", INSERTIONS_DEFAULT_QUALITY, other.INSERTIONS_DEFAULT_QUALITY,thisRole,otherRole); + compareSimpleReportArgument(result,"maximum_cycle_value", MAXIMUM_CYCLE_VALUE, other.MAXIMUM_CYCLE_VALUE,thisRole,otherRole); + compareSimpleReportArgument(result,"low_quality_tail", LOW_QUAL_TAIL, other.LOW_QUAL_TAIL,thisRole,otherRole); + compareSimpleReportArgument(result,"default_platform", DEFAULT_PLATFORM, other.DEFAULT_PLATFORM,thisRole,otherRole); + compareSimpleReportArgument(result,"force_platform", FORCE_PLATFORM, other.FORCE_PLATFORM,thisRole,otherRole); + compareSimpleReportArgument(result,"quantizing_levels", QUANTIZING_LEVELS, other.QUANTIZING_LEVELS,thisRole,otherRole); + compareSimpleReportArgument(result,"binary_tag_name", BINARY_TAG_NAME, other.BINARY_TAG_NAME,thisRole,otherRole); + return result; + } + + + /** + * Compares the covariate report lists. + * + * @param diffs map where to annotate the difference. + * @param other the argument collection to compare against. + * @param thisRole the name for this argument collection that makes sense to the user. + * @param otherRole the name for the other argument collection that makes sense to the end user. + * + * @return true if a difference was found. + */ + @Requires("diffs != null && other != null && thisRole != null && otherRole != null") + private boolean compareRequestedCovariates(final Map diffs, + final RecalibrationArgumentCollection other, final String thisRole, final String otherRole) { + + final Set beforeNames = new HashSet<>(this.COVARIATES.length); + final Set afterNames = new HashSet<>(other.COVARIATES.length); + Utils.addAll(beforeNames, this.COVARIATES); + Utils.addAll(afterNames,other.COVARIATES); + final Set intersect = new HashSet<>(Math.min(beforeNames.size(),afterNames.size())); + intersect.addAll(beforeNames); + intersect.retainAll(afterNames); + + String diffMessage = null; + if (intersect.size() == 0) { // In practice this is not possible due to required covariates but... + diffMessage = String.format("There are no common covariates between '%s' and '%s'" + + " recalibrator reports. Covariates in '%s': {%s}. Covariates in '%s': {%s}.",thisRole,otherRole, + thisRole,Utils.join(", ",this.COVARIATES), + otherRole,Utils.join(",",other.COVARIATES)); + } else if (intersect.size() != beforeNames.size() || intersect.size() != afterNames.size()) { + beforeNames.removeAll(intersect); + afterNames.removeAll(intersect); + diffMessage = String.format("There are differences in the set of covariates requested in the" + + " '%s' and '%s' recalibrator reports. " + + " Exclusive to '%s': {%s}. Exclusive to '%s': {%s}.",thisRole,otherRole, + thisRole,Utils.join(", ",beforeNames), + otherRole,Utils.join(", ",afterNames)); + } + if (diffMessage != null) { + diffs.put("covariate",diffMessage); + return true; + } else { + return false; + } + } + + /** + * Annotates a map with any difference encountered in a simple value report argument that differs between this an + * another {@link RecalibrationArgumentCollection} instance. + *

+ * The key of the new entry would be the name of that argument in the report file. The value is a message + * that explains the difference to the end user. + *

+ * + *

+ * This method should not return any exception. + * + * @param diffs where to annotate the differences. + * @param name the name of the report argument to compare. + * @param thisValue this argument collection value for that argument. + * @param otherValue the other collection value for that argument. + * @param thisRole the name used to refer to this RAC report that makes sense to the end user. + * @param otherRole the name used to refer to the other RAC report that makes sense to the end user. + * + * @type T the argument Object value type. + * + * @return true if a difference has been spotted, thus diff has been modified. + */ + private boolean compareSimpleReportArgument(final Map diffs, + final String name, final T thisValue, final T otherValue, final String thisRole, final String otherRole) { + if (thisValue == null && otherValue == null) { + return false; + } else if (thisValue != null && thisValue.equals(otherValue)) { + return false; + } else { + diffs.put(name, + String.format("differences between '%s' {%s} and '%s' {%s}.", + thisRole,thisValue == null ? "" : thisValue, + otherRole,otherValue == null ? "" : otherValue)); + return true; + } + + } + + /** + * Create a shallow copy of this argument collection. + * + * @return never null. + */ + @Override + public RecalibrationArgumentCollection clone() { + try { + return (RecalibrationArgumentCollection) super.clone(); + } catch (CloneNotSupportedException e) { + throw new GATKException("Unreachable code clone not supported thrown when the class " + + this.getClass().getName() + " is cloneable ",e); + } + } + +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationReport.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationReport.java new file mode 100644 index 000000000..a9b401c2b --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationReport.java @@ -0,0 +1,425 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportTable; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.recalibration.EventType; +import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate; + +import java.io.*; +import java.util.*; + +/** + * This class has all the static functionality for reading a recalibration report file into memory. + * + * @author carneiro + * @since 3/26/12 + */ +public class RecalibrationReport { + private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done) + private final RecalibrationTables recalibrationTables; // quick access reference to the tables + private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation + private final HashMap optionalCovariateIndexes; + + private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes + private final RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter + + private final int[] tempRGarray = new int[2]; + private final int[] tempQUALarray = new int[3]; + private final int[] tempCOVarray = new int[4]; + + public RecalibrationReport(final File recalFile) { + this(recalFile, getReadGroups(recalFile)); + } + + public RecalibrationReport(final File recalFile, final SortedSet allReadGroups) { + final GATKReport report = new GATKReport(recalFile); + + argumentTable = report.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE); + RAC = initializeArgumentCollectionTable(argumentTable); + + GATKReportTable quantizedTable = report.getTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE); + quantizationInfo = initializeQuantizationTable(quantizedTable); + + Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates + ArrayList requiredCovariates = covariates.getFirst(); + ArrayList optionalCovariates = covariates.getSecond(); + requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; + optionalCovariateIndexes = new HashMap(optionalCovariates.size()); + int covariateIndex = 0; + for (final Covariate covariate : requiredCovariates) + requestedCovariates[covariateIndex++] = covariate; + for (final Covariate covariate : optionalCovariates) { + requestedCovariates[covariateIndex] = covariate; + final String covariateName = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport + optionalCovariateIndexes.put(covariateName, covariateIndex-2); + covariateIndex++; + } + + for (Covariate cov : requestedCovariates) + cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection + + recalibrationTables = new RecalibrationTables(requestedCovariates, allReadGroups.size()); + + initializeReadGroupCovariates(allReadGroups); + + parseReadGroupTable(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getReadGroupTable()); + + parseQualityScoreTable(report.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE), recalibrationTables.getQualityScoreTable()); + + parseAllCovariatesTable(report.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE), recalibrationTables); + + } + + /** + * Gets the unique read groups in the recal file + * + * @param recalFile the recal file as a GATK Report + * @return the unique read groups + */ + public static SortedSet getReadGroups(final File recalFile) { + return getReadGroups(new GATKReport(recalFile)); + } + + /** + * Gets the unique read groups in the table + * + * @param report the GATKReport containing the table with RecalUtils.READGROUP_REPORT_TABLE_TITLE + * @return the unique read groups + */ + private static SortedSet getReadGroups(final GATKReport report) { + final GATKReportTable reportTable = report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE); + final SortedSet readGroups = new TreeSet(); + for ( int i = 0; i < reportTable.getNumRows(); i++ ) + readGroups.add(reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME).toString()); + return readGroups; + } + + /** + * Combines two recalibration reports by adding all observations and errors + * + * Note: This method DOES NOT recalculate the empirical qualities and quantized qualities. You have to recalculate + * them after combining. The reason for not calculating it is because this function is intended for combining a + * series of recalibration reports, and it only makes sense to calculate the empirical qualities and quantized + * qualities after all the recalibration reports have been combined. Having the user recalculate when appropriate, + * makes this method faster + * + * Note2: The empirical quality reported, however, is recalculated given its simplicity. + * + * @param other the recalibration report to combine with this one + */ + public void combine(final RecalibrationReport other) { + for ( int tableIndex = 0; tableIndex < recalibrationTables.numTables(); tableIndex++ ) { + final NestedIntegerArray myTable = recalibrationTables.getTable(tableIndex); + final NestedIntegerArray otherTable = other.recalibrationTables.getTable(tableIndex); + RecalUtils.combineTables(myTable, otherTable); + } + } + + public QuantizationInfo getQuantizationInfo() { + return quantizationInfo; + } + + public RecalibrationTables getRecalibrationTables() { + return recalibrationTables; + } + + public Covariate[] getRequestedCovariates() { + return requestedCovariates; + } + + /** + * Initialize read group keys using the shared list of all the read groups. + * + * By using the same sorted set of read groups across all recalibration reports, even if + * one report is missing a read group, all the reports use the same read group keys. + * + * @param allReadGroups The list of all possible read groups + */ + private void initializeReadGroupCovariates(final SortedSet allReadGroups) { + for (String readGroup: allReadGroups) { + requestedCovariates[0].keyFromValue(readGroup); + } + } + + /** + * Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table + * + * @param reportTable the GATKReport table containing data for this table + * @param recalibrationTables the recalibration tables +\ */ + private void parseAllCovariatesTable(final GATKReportTable reportTable, final RecalibrationTables recalibrationTables) { + for ( int i = 0; i < reportTable.getNumRows(); i++ ) { + final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); + tempCOVarray[0] = requestedCovariates[0].keyFromValue(rg); + final Object qual = reportTable.get(i, RecalUtils.QUALITY_SCORE_COLUMN_NAME); + tempCOVarray[1] = requestedCovariates[1].keyFromValue(qual); + + final String covName = (String)reportTable.get(i, RecalUtils.COVARIATE_NAME_COLUMN_NAME); + final int covIndex = optionalCovariateIndexes.get(covName); + final Object covValue = reportTable.get(i, RecalUtils.COVARIATE_VALUE_COLUMN_NAME); + tempCOVarray[2] = requestedCovariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + covIndex].keyFromValue(covValue); + + final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); + tempCOVarray[3] = event.ordinal(); + + recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + covIndex).put(getRecalDatum(reportTable, i, false), tempCOVarray); + } + } + + /** + * + * Compiles the list of keys for the QualityScore table and uses the shared parsing utility to produce the actual table + * @param reportTable the GATKReport table containing data for this table + * @param qualTable the map representing this table + */ + private void parseQualityScoreTable(final GATKReportTable reportTable, final NestedIntegerArray qualTable) { + for ( int i = 0; i < reportTable.getNumRows(); i++ ) { + final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); + tempQUALarray[0] = requestedCovariates[0].keyFromValue(rg); + final Object qual = reportTable.get(i, RecalUtils.QUALITY_SCORE_COLUMN_NAME); + tempQUALarray[1] = requestedCovariates[1].keyFromValue(qual); + final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); + tempQUALarray[2] = event.ordinal(); + + qualTable.put(getRecalDatum(reportTable, i, false), tempQUALarray); + } + } + + /** + * Compiles the list of keys for the ReadGroup table and uses the shared parsing utility to produce the actual table + * + * @param reportTable the GATKReport table containing data for this table + * @param rgTable the map representing this table + */ + private void parseReadGroupTable(final GATKReportTable reportTable, final NestedIntegerArray rgTable) { + for ( int i = 0; i < reportTable.getNumRows(); i++ ) { + final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); + tempRGarray[0] = requestedCovariates[0].keyFromValue(rg); + final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); + tempRGarray[1] = event.ordinal(); + + rgTable.put(getRecalDatum(reportTable, i, true), tempRGarray); + } + } + + private double asDouble(final Object o) { + if ( o instanceof Double ) + return (Double)o; + else if ( o instanceof Integer ) + return (Integer)o; + else if ( o instanceof Long ) + return (Long)o; + else + throw new ReviewedGATKException("Object " + o + " is expected to be either a double, long or integer but it's not either: " + o.getClass()); + } + + private long asLong(final Object o) { + if ( o instanceof Long ) + return (Long)o; + else if ( o instanceof Integer ) + return ((Integer)o).longValue(); + else if ( o instanceof Double ) + return ((Double)o).longValue(); + else + throw new ReviewedGATKException("Object " + o + " is expected to be a long but it's not: " + o.getClass()); + } + + private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) { + final long nObservations = asLong(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME)); + final double nErrors = asDouble(reportTable.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME)); + //final double empiricalQuality = asDouble(reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME)); + + // the estimatedQreported column only exists in the ReadGroup table + final double estimatedQReported = hasEstimatedQReportedColumn ? + (Double) reportTable.get(row, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table + Byte.parseByte((String) reportTable.get(row, RecalUtils.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table + + final RecalDatum datum = new RecalDatum(nObservations, nErrors, (byte)1); + datum.setEstimatedQReported(estimatedQReported); + //datum.setEmpiricalQuality(empiricalQuality); // don't set the value here because we will want to recompute with a different conditional Q score prior value + return datum; + } + + /** + * Parses the quantization table from the GATK Report and turns it into a map of original => quantized quality scores + * + * @param table the GATKReportTable containing the quantization mappings + * @return an ArrayList with the quantization mappings from 0 to MAX_SAM_QUAL_SCORE + */ + private QuantizationInfo initializeQuantizationTable(GATKReportTable table) { + final Byte[] quals = new Byte[QualityUtils.MAX_SAM_QUAL_SCORE + 1]; + final Long[] counts = new Long[QualityUtils.MAX_SAM_QUAL_SCORE + 1]; + for ( int i = 0; i < table.getNumRows(); i++ ) { + final byte originalQual = (byte)i; + final Object quantizedObject = table.get(i, RecalUtils.QUANTIZED_VALUE_COLUMN_NAME); + final Object countObject = table.get(i, RecalUtils.QUANTIZED_COUNT_COLUMN_NAME); + final byte quantizedQual = Byte.parseByte(quantizedObject.toString()); + final long quantizedCount = Long.parseLong(countObject.toString()); + quals[originalQual] = quantizedQual; + counts[originalQual] = quantizedCount; + } + return new QuantizationInfo(Arrays.asList(quals), Arrays.asList(counts)); + } + + /** + * Parses the arguments table from the GATK Report and creates a RAC object with the proper initialization values + * + * @param table the GATKReportTable containing the arguments and its corresponding values + * @return a RAC object properly initialized with all the objects in the table + */ + private RecalibrationArgumentCollection initializeArgumentCollectionTable(GATKReportTable table) { + final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + + for ( int i = 0; i < table.getNumRows(); i++ ) { + final String argument = table.get(i, "Argument").toString(); + Object value = table.get(i, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); + if (value.equals("null")) + value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport + + if (argument.equals("covariate") && value != null) + RAC.COVARIATES = value.toString().split(","); + + else if (argument.equals("standard_covs")) + RAC.DO_NOT_USE_STANDARD_COVARIATES = Boolean.parseBoolean((String) value); + + else if (argument.equals("solid_recal_mode")) + RAC.SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.recalModeFromString((String) value); + + else if (argument.equals("solid_nocall_strategy")) + RAC.SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.nocallStrategyFromString((String) value); + + else if (argument.equals("mismatches_context_size")) + RAC.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value); + + else if (argument.equals("indels_context_size")) + RAC.INDELS_CONTEXT_SIZE = Integer.parseInt((String) value); + + else if (argument.equals("mismatches_default_quality")) + RAC.MISMATCHES_DEFAULT_QUALITY = Byte.parseByte((String) value); + + else if (argument.equals("insertions_default_quality")) + RAC.INSERTIONS_DEFAULT_QUALITY = Byte.parseByte((String) value); + + else if (argument.equals("deletions_default_quality")) + RAC.DELETIONS_DEFAULT_QUALITY = Byte.parseByte((String) value); + + else if (argument.equals("maximum_cycle_value")) + RAC.MAXIMUM_CYCLE_VALUE = Integer.parseInt((String) value); + + else if (argument.equals("low_quality_tail")) + RAC.LOW_QUAL_TAIL = Byte.parseByte((String) value); + + else if (argument.equals("default_platform")) + RAC.DEFAULT_PLATFORM = (String) value; + + else if (argument.equals("force_platform")) + RAC.FORCE_PLATFORM = (String) value; + + else if (argument.equals("quantizing_levels")) + RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value); + + else if (argument.equals("recalibration_report")) + RAC.existingRecalibrationReport = (value == null) ? null : new File((String) value); + + else if (argument.equals("binary_tag_name")) + RAC.BINARY_TAG_NAME = (value == null) ? null : (String) value; + + else if (argument.equals("sort_by_all_columns")) + RAC.SORT_BY_ALL_COLUMNS = Boolean.parseBoolean((String) value); + } + + return RAC; + } + + /** + * this functionality avoids recalculating the empirical qualities, estimated reported quality + * and quantization of the quality scores during every call of combine(). Very useful for the BQSRGatherer. + */ + public void calculateQuantizedQualities() { + quantizationInfo = new QuantizationInfo(recalibrationTables, RAC.QUANTIZING_LEVELS); + } + + /** + * Creates the recalibration report. Report can then be written to a stream via GATKReport.print(PrintStream). + * + * @return newly created recalibration report + */ + public GATKReport createGATKReport() { + return RecalUtils.createRecalibrationGATKReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, RAC.SORT_BY_ALL_COLUMNS); + } + + public RecalibrationArgumentCollection getRAC() { + return RAC; + } + + /** + * + * @deprecated use {@link #getRequestedCovariates()} instead. + */ + @Deprecated + public Covariate[] getCovariates() { + return requestedCovariates; + } + + /** + * @return true if the report has no data + */ + public boolean isEmpty() { + return recalibrationTables.isEmpty(); + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTables.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTables.java new file mode 100644 index 000000000..ad227f9bd --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTables.java @@ -0,0 +1,169 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import com.google.java.contract.Ensures; +import org.broadinstitute.gatk.utils.collections.LoggingNestedIntegerArray; +import org.broadinstitute.gatk.utils.recalibration.EventType; +import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate; +import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; + +import java.io.PrintStream; +import java.util.ArrayList; + +/** + * Utility class to facilitate on-the-fly base quality score recalibration. + * + * User: ebanks + * Date: 6/20/12 + */ + +public final class RecalibrationTables { + public enum TableType { + READ_GROUP_TABLE, + QUALITY_SCORE_TABLE, + OPTIONAL_COVARIATE_TABLES_START; + } + + private final ArrayList> tables; + private final int qualDimension; + private final int eventDimension = EventType.values().length; + private final int numReadGroups; + private final PrintStream log; + + public RecalibrationTables(final Covariate[] covariates) { + this(covariates, covariates[TableType.READ_GROUP_TABLE.ordinal()].maximumKeyValue() + 1, null); + } + + public RecalibrationTables(final Covariate[] covariates, final int numReadGroups) { + this(covariates, numReadGroups, null); + } + + public RecalibrationTables(final Covariate[] covariates, final int numReadGroups, final PrintStream log) { + tables = new ArrayList>(covariates.length); + for ( int i = 0; i < covariates.length; i++ ) + tables.add(i, null); // initialize so we can set below + + qualDimension = covariates[TableType.QUALITY_SCORE_TABLE.ordinal()].maximumKeyValue() + 1; + this.numReadGroups = numReadGroups; + this.log = log; + + tables.set(TableType.READ_GROUP_TABLE.ordinal(), + log == null ? new NestedIntegerArray(numReadGroups, eventDimension) : + new LoggingNestedIntegerArray(log, "READ_GROUP_TABLE", numReadGroups, eventDimension)); + + tables.set(TableType.QUALITY_SCORE_TABLE.ordinal(), makeQualityScoreTable()); + + for (int i = TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < covariates.length; i++) + tables.set(i, + log == null ? new NestedIntegerArray(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension) : + new LoggingNestedIntegerArray(log, String.format("OPTIONAL_COVARIATE_TABLE_%d", i - TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + 1), + numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension)); + } + + @Ensures("result != null") + public NestedIntegerArray getReadGroupTable() { + return getTable(TableType.READ_GROUP_TABLE.ordinal()); + } + + @Ensures("result != null") + public NestedIntegerArray getQualityScoreTable() { + return getTable(TableType.QUALITY_SCORE_TABLE.ordinal()); + } + + @Ensures("result != null") + public NestedIntegerArray getTable(final int index) { + return tables.get(index); + } + + @Ensures("result >= 0") + public int numTables() { + return tables.size(); + } + + /** + * @return true if all the tables contain no RecalDatums + */ + public boolean isEmpty() { + for( final NestedIntegerArray table : tables ) { + if( !table.getAllValues().isEmpty() ) { return false; } + } + return true; + } + + /** + * Allocate a new quality score table, based on requested parameters + * in this set of tables, without any data in it. The return result + * of this table is suitable for acting as a thread-local cache + * for quality score values + * @return a newly allocated, empty read group x quality score table + */ + public NestedIntegerArray makeQualityScoreTable() { + return log == null + ? new NestedIntegerArray(numReadGroups, qualDimension, eventDimension) + : new LoggingNestedIntegerArray(log, "QUALITY_SCORE_TABLE", numReadGroups, qualDimension, eventDimension); + } + + /** + * Merge all of the tables from toMerge into into this set of tables + */ + public void combine(final RecalibrationTables toMerge) { + if ( numTables() != toMerge.numTables() ) + throw new IllegalArgumentException("Attempting to merge RecalibrationTables with different sizes"); + + for ( int i = 0; i < numTables(); i++ ) { + final NestedIntegerArray myTable = this.getTable(i); + final NestedIntegerArray otherTable = toMerge.getTable(i); + RecalUtils.combineTables(myTable, otherTable); + } + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ContextCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ContextCovariate.java new file mode 100644 index 000000000..f1ef944dc --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ContextCovariate.java @@ -0,0 +1,304 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.recalibration.RecalibrationArgumentCollection; +import org.broadinstitute.gatk.utils.BaseUtils; +import org.broadinstitute.gatk.utils.clipping.ClippingRepresentation; +import org.broadinstitute.gatk.utils.clipping.ReadClipper; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.engine.recalibration.ReadCovariates; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.ArrayList; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: 9/26/11 + */ + +public class ContextCovariate implements StandardCovariate { + private final static Logger logger = Logger.getLogger(ContextCovariate.class); + + + + private int mismatchesContextSize; + private int indelsContextSize; + + private int mismatchesKeyMask; + private int indelsKeyMask; + + private static final int LENGTH_BITS = 4; + private static final int LENGTH_MASK = 15; + + // the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are + // not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base. + static final private int MAX_DNA_CONTEXT = 13; + private byte LOW_QUAL_TAIL; + + // Initialize any member variables using the command-line arguments passed to the walkers + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { + mismatchesContextSize = RAC.MISMATCHES_CONTEXT_SIZE; + indelsContextSize = RAC.INDELS_CONTEXT_SIZE; + + logger.info("\t\tContext sizes: base substitution model " + mismatchesContextSize + ", indel substitution model " + indelsContextSize); + + if (mismatchesContextSize > MAX_DNA_CONTEXT) + throw new UserException.BadArgumentValue("mismatches_context_size", String.format("context size cannot be bigger than %d, but was %d", MAX_DNA_CONTEXT, mismatchesContextSize)); + if (indelsContextSize > MAX_DNA_CONTEXT) + throw new UserException.BadArgumentValue("indels_context_size", String.format("context size cannot be bigger than %d, but was %d", MAX_DNA_CONTEXT, indelsContextSize)); + + LOW_QUAL_TAIL = RAC.LOW_QUAL_TAIL; + + if (mismatchesContextSize <= 0 || indelsContextSize <= 0) + throw new UserException(String.format("Context size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Indels: %d", mismatchesContextSize, indelsContextSize)); + + mismatchesKeyMask = createMask(mismatchesContextSize); + indelsKeyMask = createMask(indelsContextSize); + } + + @Override + public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { + + // store the original bases and then write Ns over low quality ones + final byte[] originalBases = read.getReadBases().clone(); + // Write N's over the low quality tail of the reads to avoid adding them into the context + final GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); + + final boolean negativeStrand = clippedRead.getReadNegativeStrandFlag(); + byte[] bases = clippedRead.getReadBases(); + if (negativeStrand) + bases = BaseUtils.simpleReverseComplement(bases); + + final ArrayList mismatchKeys = contextWith(bases, mismatchesContextSize, mismatchesKeyMask); + final ArrayList indelKeys = contextWith(bases, indelsContextSize, indelsKeyMask); + + final int readLength = bases.length; + + // this is necessary to ensure that we don't keep historical data in the ReadCovariates values + // since the context covariate may not span the entire set of values in read covariates + // due to the clipping of the low quality bases + if ( readLength != originalBases.length ) { + // don't both zeroing out if we are going to overwrite the whole array + for ( int i = 0; i < originalBases.length; i++ ) + // this base has been clipped off, so zero out the covariate values here + values.addCovariate(0, 0, 0, i); + } + + for (int i = 0; i < readLength; i++) { + final int readOffset = (negativeStrand ? readLength - i - 1 : i); + final int indelKey = indelKeys.get(i); + values.addCovariate(mismatchKeys.get(i), indelKey, indelKey, readOffset); + } + + // put the original bases back in + read.setReadBases(originalBases); + } + + // Used to get the covariate's value from input csv file during on-the-fly recalibration + @Override + public final Object getValue(final String str) { + return str; + } + + @Override + public String formatKey(final int key) { + if (key == -1) // this can only happen in test routines because we do not propagate null keys to the csv file + return null; + + return contextFromKey(key); + } + + @Override + public int keyFromValue(final Object value) { + return keyFromContext((String) value); + } + + private static int createMask(final int contextSize) { + int mask = 0; + // create 2*contextSize worth of bits + for (int i = 0; i < contextSize; i++) + mask = (mask << 2) | 3; + // shift 4 bits to mask out the bits used to encode the length + return mask << LENGTH_BITS; + } + + /** + * calculates the context of a base independent of the covariate mode (mismatch, insertion or deletion) + * + * @param bases the bases in the read to build the context from + * @param contextSize context size to use building the context + * @param mask mask for pulling out just the context bits + */ + private static ArrayList contextWith(final byte[] bases, final int contextSize, final int mask) { + + final int readLength = bases.length; + final ArrayList keys = new ArrayList(readLength); + + // the first contextSize-1 bases will not have enough previous context + for (int i = 1; i < contextSize && i <= readLength; i++) + keys.add(-1); + + if (readLength < contextSize) + return keys; + + final int newBaseOffset = 2 * (contextSize - 1) + LENGTH_BITS; + + // get (and add) the key for the context starting at the first base + int currentKey = keyFromContext(bases, 0, contextSize); + keys.add(currentKey); + + // if the first key was -1 then there was an N in the context; figure out how many more consecutive contexts it affects + int currentNPenalty = 0; + if (currentKey == -1) { + currentKey = 0; + currentNPenalty = contextSize - 1; + int offset = newBaseOffset; + while (bases[currentNPenalty] != 'N') { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(bases[currentNPenalty]); + currentKey |= (baseIndex << offset); + offset -= 2; + currentNPenalty--; + } + } + + for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(bases[currentIndex]); + if (baseIndex == -1) { // ignore non-ACGT bases + currentNPenalty = contextSize; + currentKey = 0; // reset the key + } else { + // push this base's contribution onto the key: shift everything 2 bits, mask out the non-context bits, and add the new base and the length in + currentKey = (currentKey >> 2) & mask; + currentKey |= (baseIndex << newBaseOffset); + currentKey |= contextSize; + } + + if (currentNPenalty == 0) { + keys.add(currentKey); + } else { + currentNPenalty--; + keys.add(-1); + } + } + + return keys; + } + + public static int keyFromContext(final String dna) { + return keyFromContext(dna.getBytes(), 0, dna.length()); + } + + /** + * Creates a int representation of a given dna string. + * + * @param dna the dna sequence + * @param start the start position in the byte array (inclusive) + * @param end the end position in the array (exclusive) + * @return the key representing the dna sequence + */ + private static int keyFromContext(final byte[] dna, final int start, final int end) { + + int key = end - start; + int bitOffset = LENGTH_BITS; + for (int i = start; i < end; i++) { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(dna[i]); + if (baseIndex == -1) // ignore non-ACGT bases + return -1; + key |= (baseIndex << bitOffset); + bitOffset += 2; + } + return key; + } + + /** + * Converts a key into the dna string representation. + * + * @param key the key representing the dna sequence + * @return the dna sequence represented by the key + */ + public static String contextFromKey(final int key) { + if (key < 0) + throw new ReviewedGATKException("dna conversion cannot handle negative numbers. Possible overflow?"); + + final int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context + int mask = 48; // use the mask to pull out bases + int offset = LENGTH_BITS; + + StringBuilder dna = new StringBuilder(); + for (int i = 0; i < length; i++) { + final int baseIndex = (key & mask) >> offset; + dna.append((char)BaseUtils.baseIndexToSimpleBase(baseIndex)); + mask = mask << 2; // move the mask over to the next 2 bits + offset += 2; + } + + return dna.toString(); + } + + @Override + public int maximumKeyValue() { + // the maximum value is T (11 in binary) for each base in the context + int length = Math.max(mismatchesContextSize, indelsContextSize); // the length of the context + int key = length; + int bitOffset = LENGTH_BITS; + for (int i = 0; i MAX_CYCLE_FOR_INDELS) ? -1 : substitutionKey; + values.addCovariate(substitutionKey, indelKey, indelKey, i); + cycle += increment; + } + } + + // Flow cycle platforms + else if (ngsPlatform.getSequencerType() == SequencerFlowClass.FLOW) { + + final byte[] bases = read.getReadBases(); + + // Differentiate between first and second of pair. + // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group + // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. + // Therefore the cycle covariate must differentiate between first and second of pair reads. + // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because + // the current sequential model would consider the effects independently instead of jointly. + final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag(); + + int cycle = multiplyByNegative1 ? -1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms. + + // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change + // For example, AAAAAAA was probably read in two flow cycles but here we count it as one + if (!read.getReadNegativeStrandFlag()) { // Forward direction + int iii = 0; + while (iii < readLength) { + while (iii < readLength && bases[iii] == (byte) 'T') { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii++; + } + while (iii < readLength && bases[iii] == (byte) 'A') { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii++; + } + while (iii < readLength && bases[iii] == (byte) 'C') { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii++; + } + while (iii < readLength && bases[iii] == (byte) 'G') { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii++; + } + if (iii < readLength) { + if (multiplyByNegative1) + cycle--; + else + cycle++; + } + if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii++; + } + + } + } + else { // Negative direction + int iii = readLength - 1; + while (iii >= 0) { + while (iii >= 0 && bases[iii] == (byte) 'T') { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii--; + } + while (iii >= 0 && bases[iii] == (byte) 'A') { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii--; + } + while (iii >= 0 && bases[iii] == (byte) 'C') { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii--; + } + while (iii >= 0 && bases[iii] == (byte) 'G') { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii--; + } + if (iii >= 0) { + if (multiplyByNegative1) + cycle--; + else + cycle++; + } + if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii--; + } + } + } + } + + // Unknown platforms + else { + throw new UserException("The platform (" + read.getReadGroup().getPlatform() + + ") associated with read group " + read.getReadGroup() + + " is not a recognized platform. Allowable options are " + NGSPlatform.knownPlatformsString()); + } + } + + // Used to get the covariate's value from input csv file during on-the-fly recalibration + @Override + public final Object getValue(final String str) { + return Integer.parseInt(str); + } + + @Override + public String formatKey(final int key) { + int cycle = key >> 1; // shift so we can remove the "sign" bit + if ( (key & 1) != 0 ) // is the last bit set? + cycle *= -1; // then the cycle is negative + return String.format("%d", cycle); + } + + @Override + public int keyFromValue(final Object value) { + return (value instanceof String) ? keyFromCycle(Integer.parseInt((String) value)) : keyFromCycle((Integer) value); + } + + @Override + public int maximumKeyValue() { + return (MAXIMUM_CYCLE_VALUE << 1) + 1; + } + + private int keyFromCycle(final int cycle) { + // no negative values because values must fit into the first few bits of the long + int result = Math.abs(cycle); + if ( result > MAXIMUM_CYCLE_VALUE ) + throw new UserException("The maximum allowed value for the cycle is " + MAXIMUM_CYCLE_VALUE + ", but a larger cycle (" + result + ") was detected. Please use the --maximum_cycle_value argument to increase this value (at the expense of requiring more memory to run)"); + + result = result << 1; // shift so we can add the "sign" bit + if ( cycle < 0 ) + result++; // negative cycles get the lower-most bit set + return result; + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ExperimentalCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ExperimentalCovariate.java new file mode 100644 index 000000000..c276f43ec --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ExperimentalCovariate.java @@ -0,0 +1,81 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +/** + * [Short one sentence description of this walker] + *

+ *

+ * [Functionality of this walker] + *

+ *

+ *

Input

+ *

+ * [Input description] + *

+ *

+ *

Output

+ *

+ * [Output description] + *

+ *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +public interface ExperimentalCovariate extends Covariate {} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/QualityScoreCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/QualityScoreCovariate.java new file mode 100644 index 000000000..889e00b9a --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/QualityScoreCovariate.java @@ -0,0 +1,129 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +import org.broadinstitute.gatk.engine.recalibration.ReadCovariates; +import org.broadinstitute.gatk.engine.recalibration.RecalibrationArgumentCollection; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 3, 2009 + * + * The Reported Quality Score covariate. + */ + +public class QualityScoreCovariate implements RequiredCovariate { + + // Initialize any member variables using the command-line arguments passed to the walkers + @Override + public void initialize(final RecalibrationArgumentCollection RAC) {} + + @Override + public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { + final byte[] baseQualities = read.getBaseQualities(); + final byte[] baseInsertionQualities = read.getBaseInsertionQualities(); + final byte[] baseDeletionQualities = read.getBaseDeletionQualities(); + + for (int i = 0; i < baseQualities.length; i++) { + values.addCovariate((int)baseQualities[i], (int)baseInsertionQualities[i], (int)baseDeletionQualities[i], i); + } + } + + // Used to get the covariate's value from input csv file during on-the-fly recalibration + @Override + public final Object getValue(final String str) { + return Byte.parseByte(str); + } + + @Override + public String formatKey(final int key) { + return String.format("%d", key); + } + + @Override + public int keyFromValue(final Object value) { + return (value instanceof String) ? (int)Byte.parseByte((String) value) : (int)(Byte) value; + } + + @Override + public int maximumKeyValue() { + return QualityUtils.MAX_SAM_QUAL_SCORE; + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ReadGroupCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ReadGroupCovariate.java new file mode 100644 index 000000000..9f4c34463 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ReadGroupCovariate.java @@ -0,0 +1,190 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +import org.broadinstitute.gatk.engine.recalibration.RecalibrationArgumentCollection; +import org.broadinstitute.gatk.engine.recalibration.ReadCovariates; +import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Oct 30, 2009 + * + * The Read Group covariate. + */ + +public class ReadGroupCovariate implements RequiredCovariate { + + private final HashMap readGroupLookupTable = new HashMap(); + private final HashMap readGroupReverseLookupTable = new HashMap(); + private int nextId = 0; + private String forceReadGroup; + + // Initialize any member variables using the command-line arguments passed to the walkers + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { + forceReadGroup = RAC.FORCE_READGROUP; + } + + @Override + public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { + final String readGroupId = readGroupValueFromRG(read.getReadGroup()); + final int key = keyForReadGroup(readGroupId); + + final int l = read.getReadLength(); + for (int i = 0; i < l; i++) + values.addCovariate(key, key, key, i); + } + + @Override + public final Object getValue(final String str) { + return str; + } + + @Override + public synchronized String formatKey(final int key) { + // This method is synchronized so that we don't attempt to do a get() + // from the reverse lookup table while that table is being updated + return readGroupReverseLookupTable.get(key); + } + + @Override + public int keyFromValue(final Object value) { + return keyForReadGroup((String) value); + } + + /** + * Get the mapping from read group names to integer key values for all read groups in this covariate + * @return a set of mappings from read group names -> integer key values + */ + public Set> getKeyMap() { + return readGroupLookupTable.entrySet(); + } + + private int keyForReadGroup(final String readGroupId) { + // Rather than synchronize this entire method (which would be VERY expensive for walkers like the BQSR), + // synchronize only the table updates. + + // Before entering the synchronized block, check to see if this read group is not in our tables. + // If it's not, either we will have to insert it, OR another thread will insert it first. + // This preliminary check avoids doing any synchronization most of the time. + if ( ! readGroupLookupTable.containsKey(readGroupId) ) { + + synchronized ( this ) { + + // Now we need to make sure the key is STILL not there, since another thread may have come along + // and inserted it while we were waiting to enter this synchronized block! + if ( ! readGroupLookupTable.containsKey(readGroupId) ) { + readGroupLookupTable.put(readGroupId, nextId); + readGroupReverseLookupTable.put(nextId, readGroupId); + nextId++; + } + } + } + + return readGroupLookupTable.get(readGroupId); + } + + @Override + public synchronized int maximumKeyValue() { + // Synchronized so that we don't query table size while the tables are being updated + return readGroupLookupTable.size() - 1; + } + + /** + * If the sample has a PU tag annotation, return that. If not, return the read group id. + * + * @param rg the read group record + * @return platform unit or readgroup id + */ + private String readGroupValueFromRG(final GATKSAMReadGroupRecord rg) { + if ( forceReadGroup != null ) + return forceReadGroup; + + final String platformUnit = rg.getPlatformUnit(); + return platformUnit == null ? rg.getId() : platformUnit; + } + +} + + diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatCovariate.java new file mode 100644 index 000000000..64b32d766 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatCovariate.java @@ -0,0 +1,285 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.gatk.engine.recalibration.RecalibrationArgumentCollection; +import org.broadinstitute.gatk.engine.recalibration.ReadCovariates; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.gatk.utils.BaseUtils; +import org.broadinstitute.gatk.utils.collections.Pair; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +public abstract class RepeatCovariate implements ExperimentalCovariate { + protected int MAX_REPEAT_LENGTH; + protected int MAX_STR_UNIT_LENGTH; + private final HashMap repeatLookupTable = new HashMap(); + private final HashMap repeatReverseLookupTable = new HashMap(); + private int nextId = 0; + + // Initialize any member variables using the command-line arguments passed to the walkers + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { + MAX_STR_UNIT_LENGTH = RAC.MAX_STR_UNIT_LENGTH; + MAX_REPEAT_LENGTH = RAC.MAX_REPEAT_LENGTH; + } + + public void initialize(final int MAX_STR_UNIT_LENGTH, final int MAX_REPEAT_LENGTH) { + this.MAX_STR_UNIT_LENGTH = MAX_STR_UNIT_LENGTH; + this.MAX_REPEAT_LENGTH = MAX_REPEAT_LENGTH; + } + + @Override + public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { + // store the original bases and then write Ns over low quality ones + final byte[] originalBases = read.getReadBases().clone(); + + final boolean negativeStrand = read.getReadNegativeStrandFlag(); + byte[] bases = read.getReadBases(); + if (negativeStrand) + bases = BaseUtils.simpleReverseComplement(bases); + + // don't record reads with N's + if (!BaseUtils.isAllRegularBases(bases)) + return; + + for (int i = 0; i < bases.length; i++) { + final Pair res = findTandemRepeatUnits(bases, i); + // to merge repeat unit and repeat length to get covariate value: + final String repeatID = getCovariateValueFromUnitAndLength(res.first, res.second); + final int key = keyForRepeat(repeatID); + + final int readOffset = (negativeStrand ? bases.length - i - 1 : i); + values.addCovariate(key, key, key, readOffset); + } + + // put the original bases back in + read.setReadBases(originalBases); + + } + + public Pair findTandemRepeatUnits(byte[] readBases, int offset) { + int maxBW = 0; + byte[] bestBWRepeatUnit = new byte[]{readBases[offset]}; + for (int str = 1; str <= MAX_STR_UNIT_LENGTH; str++) { + // fix repeat unit length + //edge case: if candidate tandem repeat unit falls beyond edge of read, skip + if (offset+1-str < 0) + break; + + // get backward repeat unit and # repeats + byte[] backwardRepeatUnit = Arrays.copyOfRange(readBases, offset - str + 1, offset + 1); + maxBW = GATKVariantContextUtils.findNumberOfRepetitions(backwardRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); + if (maxBW > 1) { + bestBWRepeatUnit = backwardRepeatUnit.clone(); + break; + } + } + byte[] bestRepeatUnit = bestBWRepeatUnit; + int maxRL = maxBW; + + if (offset < readBases.length-1) { + byte[] bestFWRepeatUnit = new byte[]{readBases[offset+1]}; + int maxFW = 0; + for (int str = 1; str <= MAX_STR_UNIT_LENGTH; str++) { + // fix repeat unit length + //edge case: if candidate tandem repeat unit falls beyond edge of read, skip + if (offset+str+1 > readBases.length) + break; + + // get forward repeat unit and # repeats + byte[] forwardRepeatUnit = Arrays.copyOfRange(readBases, offset +1, offset+str+1); + maxFW = GATKVariantContextUtils.findNumberOfRepetitions(forwardRepeatUnit, Arrays.copyOfRange(readBases, offset + 1, readBases.length), true); + if (maxFW > 1) { + bestFWRepeatUnit = forwardRepeatUnit.clone(); + break; + } + } + // if FW repeat unit = BW repeat unit it means we're in the middle of a tandem repeat - add FW and BW components + if (Arrays.equals(bestFWRepeatUnit, bestBWRepeatUnit)) { + maxRL = maxBW + maxFW; + bestRepeatUnit = bestFWRepeatUnit; // arbitrary + } + else { + // tandem repeat starting forward from current offset. + // It could be the case that best BW unit was differnet from FW unit, but that BW still contains FW unit. + // For example, TTCTT(C) CCC - at (C) place, best BW unit is (TTC)2, best FW unit is (C)3. + // but correct representation at that place might be (C)4. + // Hence, if the FW and BW units don't match, check if BW unit can still be a part of FW unit and add + // representations to total + maxBW = GATKVariantContextUtils.findNumberOfRepetitions(bestFWRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); + maxRL = maxFW + maxBW; + bestRepeatUnit = bestFWRepeatUnit; + + } + + } + + + + if(maxRL > MAX_REPEAT_LENGTH) { maxRL = MAX_REPEAT_LENGTH; } + return new Pair(bestRepeatUnit, maxRL); + + } + @Override + public final Object getValue(final String str) { + return str; + } + + @Override + public synchronized String formatKey(final int key) { + // This method is synchronized so that we don't attempt to do a get() + // from the reverse lookup table while that table is being updated + return repeatReverseLookupTable.get(key); + } + + @Requires({"repeatLength>=0", "repeatFromUnitAndLength != null"}) + @Ensures("result != null") + protected abstract String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength); + + + @Override + public int keyFromValue(final Object value) { + return keyForRepeat((String) value); + } + + /** + * Get the mapping from read group names to integer key values for all read groups in this covariate + * @return a set of mappings from read group names -> integer key values + */ + public Set> getKeyMap() { + return repeatLookupTable.entrySet(); + } + + private int keyForRepeat(final String repeatID) { + // Rather than synchronize this entire method (which would be VERY expensive for walkers like the BQSR), + // synchronize only the table updates. + + // Before entering the synchronized block, check to see if this read group is not in our tables. + // If it's not, either we will have to insert it, OR another thread will insert it first. + // This preliminary check avoids doing any synchronization most of the time. + if ( ! repeatLookupTable.containsKey(repeatID) ) { + + synchronized ( this ) { + + // Now we need to make sure the key is STILL not there, since another thread may have come along + // and inserted it while we were waiting to enter this synchronized block! + if ( ! repeatLookupTable.containsKey(repeatID) ) { + repeatLookupTable.put(repeatID, nextId); + repeatReverseLookupTable.put(nextId, repeatID); + nextId++; + } + } + } + + return repeatLookupTable.get(repeatID); + } + + + /** + * Splits repeat unit and num repetitions from covariate value. + * For example, if value if "ATG4" it returns (ATG,4) + * @param value Covariate value + * @return Split pair + */ + @Requires("value != null") + @Ensures({"result.first != null","result.second>=0"}) + public static Pair getRUandNRfromCovariate(final String value) { + + int k = 0; + for ( k=0; k < value.length(); k++ ) { + if (!BaseUtils.isRegularBase(value.getBytes()[k])) + break; + } + Integer nr = Integer.valueOf(value.substring(k,value.length())); // will throw NumberFormatException if format illegal + if (k == value.length() || nr <= 0) + throw new IllegalStateException("Covariate is not of form (Repeat Unit) + Integer"); + + return new Pair(value.substring(0,k), nr); + } + + /** + * Gets bases from tandem repeat representation (Repeat Unit),(Number of Repeats). + * For example, (AGC),3 returns AGCAGCAGC + * @param repeatUnit Tandem repeat unit + * @param numRepeats Number of repeats + * @return Expanded String + */ + @Requires({"numRepeats > 0","repeatUnit != null"}) + @Ensures("result != null") + public static String getBasesFromRUandNR(final String repeatUnit, final int numRepeats) { + final StringBuilder sb = new StringBuilder(); + + for (int i=0; i < numRepeats; i++) + sb.append(repeatUnit); + + return sb.toString(); + } + + // version given covariate key + public static String getBasesFromRUandNR(final String covariateValue) { + Pair pair = getRUandNRfromCovariate(covariateValue); + return getBasesFromRUandNR(pair.getFirst(), pair.getSecond()); + } + + @Override + public abstract int maximumKeyValue(); + + + +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatLengthCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatLengthCovariate.java new file mode 100644 index 000000000..fb6aeaf85 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatLengthCovariate.java @@ -0,0 +1,74 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +public class RepeatLengthCovariate extends RepeatCovariate { + + @Requires({"repeatLength>=0", "repeatFromUnitAndLength != null"}) + @Ensures("result != null") + protected String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength) { + return String.format("%d",repeatLength); + } + + @Override + public synchronized int maximumKeyValue() { + // Synchronized so that we don't query table size while the tables are being updated + //return repeatLookupTable.size() - 1; + // max possible values of covariate: for repeat unit, length is up to MAX_STR_UNIT_LENGTH, + // so we have 4^MAX_STR_UNIT_LENGTH * MAX_REPEAT_LENGTH possible values + return (1+MAX_REPEAT_LENGTH); + } + +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatUnitAndLengthCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatUnitAndLengthCovariate.java new file mode 100644 index 000000000..10a7f6672 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatUnitAndLengthCovariate.java @@ -0,0 +1,75 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + + +public class RepeatUnitAndLengthCovariate extends RepeatCovariate { + + @Requires({"repeatLength>=0", "repeatFromUnitAndLength != null"}) + @Ensures("result != null") + protected String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength) { + return new String(repeatFromUnitAndLength) + String.format("%d",repeatLength); + } + + @Override + public synchronized int maximumKeyValue() { + // Synchronized so that we don't query table size while the tables are being updated + //return repeatLookupTable.size() - 1; + // max possible values of covariate: for repeat unit, length is up to MAX_STR_UNIT_LENGTH, + // so we have 4^MAX_STR_UNIT_LENGTH * MAX_REPEAT_LENGTH possible values + return (1<<(2*MAX_STR_UNIT_LENGTH)) * MAX_REPEAT_LENGTH +1; + } + +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatUnitCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatUnitCovariate.java new file mode 100644 index 000000000..d961b1460 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatUnitCovariate.java @@ -0,0 +1,78 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 11/3/12 + */ + +public class RepeatUnitCovariate extends RepeatCovariate { + + protected String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength) { + return new String(repeatFromUnitAndLength); + + } + + + @Override + public synchronized int maximumKeyValue() { + // Synchronized so that we don't query table size while the tables are being updated + //return repeatLookupTable.size() - 1; + // max possible values of covariate: for repeat unit, length is up to MAX_STR_UNIT_LENGTH, + // so we have 4^MAX_STR_UNIT_LENGTH * MAX_REPEAT_LENGTH possible values + return (1<<(2*MAX_STR_UNIT_LENGTH)) +1; + } + + +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RequiredCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RequiredCovariate.java new file mode 100644 index 000000000..8f2155ff2 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RequiredCovariate.java @@ -0,0 +1,81 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +/** + * [Short one sentence description of this walker] + *

+ *

+ * [Functionality of this walker] + *

+ *

+ *

Input

+ *

+ * [Input description] + *

+ *

+ *

Output

+ *

+ * [Output description] + *

+ *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +public interface RequiredCovariate extends Covariate {} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/StandardCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/StandardCovariate.java new file mode 100644 index 000000000..82e2bd199 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/StandardCovariate.java @@ -0,0 +1,81 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +/** + * [Short one sentence description of this walker] + *

+ *

+ * [Functionality of this walker] + *

+ *

+ *

Input

+ *

+ * [Input description] + *

+ *

+ *

Output

+ *

+ * [Output description] + *

+ *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +public interface StandardCovariate extends Covariate {} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java index 949b61ec1..f66390fc1 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java @@ -54,21 +54,19 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFFormatHeaderLine; -import htsjdk.variant.vcf.VCFHeaderLineCount; -import htsjdk.variant.vcf.VCFHeaderLineType; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeAlleleCounts; import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculator; import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculators; import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.Arrays; import java.util.Collections; @@ -77,17 +75,17 @@ import java.util.List; /** * Allele count and frequency expectation per sample * - * Needs documentation + *

This annotation calculates the maximum likelihood (ML) number and frequency of alternate alleles for each individual sample at a site. In essence, it is equivalent to calculating the sum of "1"s in a genotype (for a biallelic site).

* */ @SuppressWarnings("unused") public final class AlleleCountBySample extends GenotypeAnnotation { - private final static List keyNames = Collections.unmodifiableList(Arrays.asList(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY,VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY)); + private final static List keyNames = Collections.unmodifiableList(Arrays.asList(GATKVCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY,GATKVCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY)); private final static List descriptors = Collections.unmodifiableList(Arrays.asList( - new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the alternate allele count, in the same order as listed, for each individual sample"), - new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the alternate allele fraction, in the same order as listed, for each individual sample") + GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY), + GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY) )); @Override @@ -121,8 +119,8 @@ public final class AlleleCountBySample extends GenotypeAnnotation { AC[alleleIndex - 1] = alleleCount; AF[alleleIndex - 1] = ((double) alleleCount) / (double) ploidy; } - gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, Utils.asList(AC)); - gb.attribute(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, Utils.asList(AF)); + gb.attribute(GATKVCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, AC); + gb.attribute(GATKVCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, AF); } @Override diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java new file mode 100644 index 000000000..aa736b2d1 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java @@ -0,0 +1,118 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.annotator; + +import htsjdk.variant.variantcontext.Genotype; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller; + +public class AnnotationUtils { + + /** + * Checks if the input data is appropriate + * + * @param walker input walker + * @param map input map for each read, holds underlying alleles represented by an aligned read, and corresponding relative likelihood. + * @param g input genotype + * @param warningsLogged array that enforces the warning is logged once for each caller + * @param logger logger specific for each caller + * + * @return true if the walker is a HaplotypeCaller, the likelihood map is non-null and the genotype is non-null and called, false otherwise + * @throws ReviewedGATKException if the size of warningsLogged is less than 4. + */ + public static boolean isAppropriateInput(final AnnotatorCompatible walker, final PerReadAlleleLikelihoodMap map, final Genotype g, final boolean[] warningsLogged, final Logger logger) { + + if ( warningsLogged.length < 4 ){ + throw new ReviewedGATKException("Warnings logged array must have at last 4 elements, but has " + warningsLogged.length); + } + + if ( !(walker instanceof HaplotypeCaller) ) { + if ( !warningsLogged[0] ) { + if ( walker != null ) + logger.warn("Annotation will not be calculated, must be called from HaplotyepCaller, not " + walker.getClass().getName()); + else + logger.warn("Annotation will not be calculated, must be called from HaplotyepCaller"); + warningsLogged[0] = true; + } + return false; + } + + if ( map == null ){ + if ( !warningsLogged[1] ) { + logger.warn("Annotation will not be calculated, can only be used with likelihood based annotations in the HaplotypeCaller"); + warningsLogged[1] = true; + } + return false; + } + + if ( g == null ){ + if ( !warningsLogged[2] ) { + logger.warn("Annotation will not be calculated, missing genotype"); + warningsLogged[2]= true; + } + return false; + } + + if ( !g.isCalled() ){ + if ( !warningsLogged[3] ) { + logger.warn("Annotation will not be calculated, genotype is not called"); + warningsLogged[3] = true; + } + return false; + } + + return true; + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java index e5d0d92d6..7bdc365f1 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseQualityRankSumTest.java @@ -54,15 +54,16 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.ReadUtils; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.gatk.utils.pileup.PileupElement; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.*; /** - * Rank Sum Test of REF vs. ALT base quality scores + * Rank Sum Test of REF versus ALT base quality scores * *

This variant-level annotation tests compares the base qualities of the data supporting the reference allele with those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.

* @@ -75,10 +76,10 @@ import java.util.*; */ public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnotation { @Override - public List getKeyNames() { return Arrays.asList("BaseQRankSum"); } + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.BASE_QUAL_RANK_SUM_KEY); } @Override - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("BaseQRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities")); } + public List getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } @Override protected Double getElementForRead(final GATKSAMRecord read, final int refLoc) { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCounts.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCounts.java index e1ffbb0f3..6f0bba28e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCounts.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCounts.java @@ -52,9 +52,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.Walker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; @@ -65,6 +65,7 @@ import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFInfoHeaderLine; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextUtils; +import org.broadinstitute.gatk.utils.variant.ChromosomeCountConstants; import java.util.*; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java index 660d78a79..3e70eea57 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java @@ -51,16 +51,16 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.utils.pileup.PileupElement; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.*; /** - * Rank Sum Test for hard-clipped bases on REF vs. ALT reads + * Rank Sum Test for hard-clipped bases on REF versus ALT reads * *

This variant-level annotation tests whether the data supporting the reference allele shows more or less base clipping (hard clips) than those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have more hard-clipped bases than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have fewer hard-clipped bases than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.

* @@ -68,15 +68,15 @@ import java.util.*; *

The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test applied to base clips (number of hard-clipped bases on reads supporting REF vs. number of hard-clipped bases on reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.

* *

Caveat

- *

The clipping rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

+ *

The clipping rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

* */ public class ClippingRankSumTest extends RankSumTest { @Override - public List getKeyNames() { return Arrays.asList("ClippingRankSum"); } + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.CLIPPING_RANK_SUM_KEY); } @Override - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ClippingRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases")); } + public List getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } @Override protected Double getElementForRead(final GATKSAMRecord read, final int refLoc) { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java index fad666f80..f8404800e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; @@ -70,7 +70,7 @@ import java.util.List; import java.util.Map; /** - * Total depth of coverage per sample (in FORMAT) and over all samples (in INFO). + * Total depth of coverage per sample and over all samples. * *

This annotation is used to provide counts of read depth at two different levels, with some important differences. At the sample level (FORMAT), the DP value is the count of reads that passed the caller's internal quality control metrics (such as MAPQ > 17, for example). At the site level (INFO), the DP value is the unfiltered depth over all samples.

* @@ -78,7 +78,7 @@ import java.util.Map; * *

Caveats

*
    - *
  • If downsampling is enabled (as is done by default for some analyses to remove excessive coverage), the depth of coverage effectively seen by the caller may be inferior to the actual depth of coverage in the original file. If using `-dcov D`, the maximum depth that can be seen for N samples will be N * D.
  • + *
  • If downsampling is enabled (as is done by default for some analyses to remove excessive coverage), the depth of coverage effectively seen by the caller may be inferior to the actual depth of coverage in the original file. If using "-dcov D", the maximum depth that can be seen for N samples will be N * D.
  • *
* *

Related annotations

diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerAlleleBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerAlleleBySample.java index 6fd39555e..536f196c4 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerAlleleBySample.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerAlleleBySample.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; @@ -90,8 +90,9 @@ import java.util.*; *

Related annotations

* */ public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java index d71d58853..50a640482 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java @@ -51,13 +51,14 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.apache.log4j.Logger; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; @@ -91,6 +92,11 @@ import java.util.*; * */ public class DepthPerSampleHC extends GenotypeAnnotation { + private final static Logger logger = Logger.getLogger(DepthPerSampleHC.class); + private boolean alleleLikelihoodMapSubsetWarningLogged = false; + boolean[] warningsLogged = new boolean[4]; + + @Override public void annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, final ReferenceContext ref, @@ -98,26 +104,29 @@ public class DepthPerSampleHC extends GenotypeAnnotation { final VariantContext vc, final Genotype g, final GenotypeBuilder gb, - final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { - if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) - return; + final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ - if (alleleLikelihoodMap == null ) - throw new IllegalStateException("DepthPerSampleHC can only be used with likelihood based annotations in the HaplotypeCaller"); + if ( !AnnotationUtils.isAppropriateInput(walker, alleleLikelihoodMap, g, warningsLogged, logger) ) { + return; + } // the depth for the HC is the sum of the informative alleles at this site. It's not perfect (as we cannot // differentiate between reads that align over the event but aren't informative vs. those that aren't even // close) but it's a pretty good proxy and it matches with the AD field (i.e., sum(AD) = DP). int dp = 0; - if ( alleleLikelihoodMap.isEmpty() ) { - // there are no reads - } else { + // there are reads + if ( !alleleLikelihoodMap.isEmpty() ) { final Set alleles = new HashSet<>(vc.getAlleles()); // make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext - if ( ! alleleLikelihoodMap.getAllelesSet().containsAll(alleles) ) - throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + alleleLikelihoodMap.getAllelesSet()); + if ( !alleleLikelihoodMap.getAllelesSet().containsAll(alleles) ) { + if ( !alleleLikelihoodMapSubsetWarningLogged ) { + logger.warn("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + alleleLikelihoodMap.getAllelesSet()); + alleleLikelihoodMapSubsetWarningLogged = true; + } + return; + } for (Map.Entry> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); @@ -130,10 +139,12 @@ public class DepthPerSampleHC extends GenotypeAnnotation { } } + @Override public List getKeyNames() { return Collections.singletonList(VCFConstants.DEPTH_KEY); } + @Override public List getDescriptions() { return Collections.singletonList(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY)); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java index 0972038d6..eb8b17ee4 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java @@ -52,24 +52,17 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import cern.jet.math.Arithmetic; -import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypesContext; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.QualityUtils; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.gatk.utils.pileup.PileupElement; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.*; @@ -99,10 +92,9 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation, private final static boolean ENABLE_DEBUGGING = false; private final static Logger logger = Logger.getLogger(FisherStrand.class); - private static final String FS = "FS"; private static final double MIN_PVALUE = 1E-320; private static final int MIN_QUAL_FOR_FILTERED_TEST = 17; - private static final int MIN_COUNT = 2; + private static final int MIN_COUNT = ARRAY_DIM; @Override protected Map calculateAnnotationFromGTfield(final GenotypesContext genotypes){ @@ -113,8 +105,8 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation, @Override protected Map calculateAnnotationFromStratifiedContexts(final Map stratifiedContexts, final VariantContext vc){ - final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1, MIN_COUNT); - final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), MIN_QUAL_FOR_FILTERED_TEST, MIN_COUNT); + final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAlleles(), -1, MIN_COUNT); + final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAlleles(), MIN_QUAL_FOR_FILTERED_TEST, MIN_COUNT); printTable("unfiltered", tableNoFiltering); printTable("filtered", tableFiltering); return pValueForBestTable(tableFiltering, tableNoFiltering); @@ -159,15 +151,17 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation, */ protected Map annotationForOneTable(final double pValue) { final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(Math.max(pValue, MIN_PVALUE))); // prevent INFINITYs - return Collections.singletonMap(FS, value); + return Collections.singletonMap(getKeyNames().get(0), value); } + @Override public List getKeyNames() { - return Collections.singletonList(FS); + return Collections.singletonList(GATKVCFConstants.FISHER_STRAND_KEY); } + @Override public List getDescriptions() { - return Collections.singletonList(new VCFInfoHeaderLine(FS, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias")); + return Collections.singletonList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } /** @@ -176,16 +170,20 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation, * @return the array used by the per-sample Strand Bias annotation */ public static List getContingencyArray( final int[][] table ) { - if(table.length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); } - if(table[0].length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); } - final List list = new ArrayList<>(4); // TODO - if we ever want to do something clever with multi-allelic sites this will need to change + if(table.length != ARRAY_DIM || table[0].length != ARRAY_DIM) { + logger.warn("Expecting a " + ARRAY_DIM + "x" + ARRAY_DIM + " strand bias table."); + return null; + } + + final List list = new ArrayList<>(ARRAY_SIZE); // TODO - if we ever want to do something clever with multi-allelic sites this will need to change list.add(table[0][0]); list.add(table[0][1]); list.add(table[1][0]); list.add(table[1][1]); return list; } - private Double pValueForContingencyTable(int[][] originalTable) { + + public static Double pValueForContingencyTable(int[][] originalTable) { final int[][] normalizedTable = normalizeContingencyTable(originalTable); int[][] table = copyContingencyTable(normalizedTable); @@ -239,9 +237,9 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation, final double normalizationFactor = (double)sum / TARGET_TABLE_SIZE; - final int[][] normalized = new int[2][2]; - for ( int i = 0; i < 2; i++ ) { - for ( int j = 0; j < 2; j++ ) + final int[][] normalized = new int[ARRAY_DIM][ARRAY_DIM]; + for ( int i = 0; i < ARRAY_DIM; i++ ) { + for ( int j = 0; j < ARRAY_DIM; j++ ) normalized[i][j] = (int)(table[i][j] / normalizationFactor); } @@ -249,10 +247,10 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation, } private static int [][] copyContingencyTable(int [][] t) { - int[][] c = new int[2][2]; + int[][] c = new int[ARRAY_DIM][ARRAY_DIM]; - for ( int i = 0; i < 2; i++ ) - for ( int j = 0; j < 2; j++ ) + for ( int i = 0; i < ARRAY_DIM; i++ ) + for ( int j = 0; j < ARRAY_DIM; j++ ) c[i][j] = t[i][j]; return c; @@ -271,28 +269,28 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation, */ private void printTable(final String name, final int[][] table) { if ( ENABLE_DEBUGGING ) { - final String pValue = (String)annotationForOneTable(pValueForContingencyTable(table)).get(FS); + final String pValue = (String)annotationForOneTable(pValueForContingencyTable(table)).get(getKeyNames().get(0)); logger.info(String.format("FS %s (REF+, REF-, ALT+, ALT-) = (%d, %d, %d, %d) = %s", name, table[0][0], table[0][1], table[1][0], table[1][1], pValue)); } } private static boolean rotateTable(int[][] table) { - table[0][0] -= 1; - table[1][0] += 1; + table[0][0]--; + table[1][0]++; - table[0][1] += 1; - table[1][1] -= 1; + table[0][1]++; + table[1][1]--; return (table[0][0] >= 0 && table[1][1] >= 0); } private static boolean unrotateTable(int[][] table) { - table[0][0] += 1; - table[1][0] -= 1; + table[0][0]++; + table[1][0]--; - table[0][1] -= 1; - table[1][1] += 1; + table[0][1]--; + table[1][1]++; return (table[0][1] >= 0 && table[1][0] >= 0); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GCContent.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GCContent.java index 54535e32c..6dc097583 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GCContent.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GCContent.java @@ -51,20 +51,17 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.help.HelpConstants; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.Arrays; import java.util.HashMap; @@ -89,14 +86,14 @@ public class GCContent extends InfoFieldAnnotation { final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { double content = computeGCContent(ref); - Map map = new HashMap(); + Map map = new HashMap<>(); map.put(getKeyNames().get(0), String.format("%.2f", content)); return map; } - public List getKeyNames() { return Arrays.asList("GC"); } + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.GC_CONTENT_KEY); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("GC", 1, VCFHeaderLineType.Integer, "GC content around the variant (see docs for window size details)")); } + public List getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0)));} public boolean useZeroQualityReads() { return false; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummaries.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummaries.java index 2460e45be..7f01f56db 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummaries.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummaries.java @@ -51,36 +51,39 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.*; /** - * Genotype summary statistics + * Summarize genotype statistics from all samples at the site level * + *

This annotation collects several genotype-level statistics from all samples and summarizes them in the INFO field. The following statistics are collected:

+ *
    + *
  • Number of called chromosomes (should amount to ploidy * called samples)
  • + *
  • Number of no-called samples
  • + *
  • p-value from Hardy-Weinberg Equilibrium test
  • + *
  • Mean of all GQ values
  • + *
  • Standard deviation of all GQ values
  • + *
+ *

Note

*

These summaries can all be recomputed from the genotypes on the fly but it is a lot faster to add them here as INFO field annotations.

*/ public class GenotypeSummaries extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { - public final static String CCC = "CCC"; - public final static String NCC = "NCC"; - public final static String HWP = "HWP"; - public final static String GQ_MEAN = "GQ_MEAN"; - public final static String GQ_STDDEV = "GQ_STDDEV"; - @Override public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -92,7 +95,7 @@ public class GenotypeSummaries extends InfoFieldAnnotation implements ActiveRegi return null; final Map returnMap = new HashMap<>(); - returnMap.put(NCC, vc.getNoCallCount()); + returnMap.put(GATKVCFConstants.NOCALL_CHROM_KEY, vc.getNoCallCount()); final MathUtils.RunningAverage average = new MathUtils.RunningAverage(); for( final Genotype g : vc.getGenotypes() ) { @@ -101,9 +104,9 @@ public class GenotypeSummaries extends InfoFieldAnnotation implements ActiveRegi } } if( average.observationCount() > 0L ) { - returnMap.put(GQ_MEAN, String.format("%.2f", average.mean())); + returnMap.put(GATKVCFConstants.GQ_MEAN_KEY, String.format("%.2f", average.mean())); if( average.observationCount() > 1L ) { - returnMap.put(GQ_STDDEV, String.format("%.2f", average.stddev())); + returnMap.put(GATKVCFConstants.GQ_STDEV_KEY, String.format("%.2f", average.stddev())); } } @@ -112,17 +115,9 @@ public class GenotypeSummaries extends InfoFieldAnnotation implements ActiveRegi @Override public List getKeyNames() { - return Arrays.asList(CCC, NCC, HWP, GQ_MEAN, GQ_STDDEV); - } - - @Override - public List getDescriptions() { return Arrays.asList( - new VCFInfoHeaderLine(CCC, 1, VCFHeaderLineType.Integer, "Number of called chromosomes"), - new VCFInfoHeaderLine(NCC, 1, VCFHeaderLineType.Integer, "Number of no-called samples"), - new VCFInfoHeaderLine(HWP, 1, VCFHeaderLineType.Float, "P value from test of Hardy Weinberg Equilibrium"), - new VCFInfoHeaderLine(GQ_MEAN, 1, VCFHeaderLineType.Float, "Mean of all GQ values"), - new VCFInfoHeaderLine(GQ_STDDEV, 1, VCFHeaderLineType.Float, "Standard deviation of all GQ values") - ); + GATKVCFConstants.NOCALL_CHROM_KEY, + GATKVCFConstants.GQ_MEAN_KEY, + GATKVCFConstants.GQ_STDEV_KEY); } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HaplotypeScore.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HaplotypeScore.java index 9b5778c1d..44c38e757 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HaplotypeScore.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HaplotypeScore.java @@ -50,11 +50,12 @@ */ package org.broadinstitute.gatk.tools.walkers.annotator; - -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedGenotyper; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; @@ -63,16 +64,16 @@ import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.QualityUtils; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.io.Serializable; import java.util.*; @@ -87,17 +88,33 @@ import java.util.*; *

HaplotypeCaller does not output this annotation because it already evaluates haplotype segregation internally. This annotation is only informative (and available) for variants called by Unified Genotyper.

*/ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { + private final static Logger logger = Logger.getLogger(HaplotypeScore.class); + private boolean walkerIdentityCheckWarningLogged = false; + private final static boolean DEBUG = false; private final static int MIN_CONTEXT_WING_SIZE = 10; private final static int MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER = 50; private final static char REGEXP_WILDCARD = '.'; + @Override public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, final ReferenceContext ref, final Map stratifiedContexts, final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { + // Can only call from UnifiedGenotyper + if ( !(walker instanceof UnifiedGenotyper) ) { + if ( !walkerIdentityCheckWarningLogged ) { + if ( walker != null ) + logger.warn("Annotation will not be calculated, must be called from UnifiedGenotyper, not " + walker.getClass().getName()); + else + logger.warn("Annotation will not be calculated, must be called from UnifiedGenotyper"); + walkerIdentityCheckWarningLogged = true; + } + return null; + } + if (vc.isSNP() && stratifiedContexts != null) return annotatePileup(ref, stratifiedContexts, vc); else @@ -108,7 +125,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final Map stratifiedContexts, final VariantContext vc) { - if (stratifiedContexts.size() == 0) // size 0 means that call was made by someone else and we have no data here + if (stratifiedContexts.isEmpty()) // empty means that call was made by someone else and we have no data here return null; final AlignmentContext context = AlignmentContextUtils.joinContexts(stratifiedContexts.values()); @@ -135,7 +152,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot } // annotate the score in the info field - final Map map = new HashMap(); + final Map map = new HashMap<>(); map.put(getKeyNames().get(0), String.format("%.4f", scoreRA.mean())); return map; } @@ -157,8 +174,8 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot int haplotypesToCompute = vc.getAlternateAlleles().size() + 1; - final PriorityQueue candidateHaplotypeQueue = new PriorityQueue(100, new HaplotypeComparator()); - final PriorityQueue consensusHaplotypeQueue = new PriorityQueue(MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER, new HaplotypeComparator()); + final PriorityQueue candidateHaplotypeQueue = new PriorityQueue<>(100, new HaplotypeComparator()); + final PriorityQueue consensusHaplotypeQueue = new PriorityQueue<>(MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER, new HaplotypeComparator()); for (final PileupElement p : pileup) { final Haplotype haplotypeFromRead = getHaplotypeFromRead(p, contextSize, locus); @@ -198,7 +215,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot // The consensus haplotypes are in a quality-ordered priority queue, so the best haplotypes are just the ones at the front of the queue final Haplotype haplotype1 = consensusHaplotypeQueue.poll(); - List hlist = new ArrayList(); + List hlist = new ArrayList<>(); hlist.add(new Haplotype(haplotype1.getBases(), 60)); for (int k = 1; k < haplotypesToCompute; k++) { @@ -313,7 +330,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot if (DEBUG) System.out.printf("HAP1: %s%n", haplotypes.get(0)); if (DEBUG) System.out.printf("HAP2: %s%n", haplotypes.get(1)); - final ArrayList haplotypeScores = new ArrayList(); + final ArrayList haplotypeScores = new ArrayList<>(); for (final PileupElement p : pileup) { // Score all the reads in the pileup, even the filtered ones final double[] scores = new double[haplotypes.size()]; @@ -394,12 +411,14 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot return mismatches - expected; } + @Override public List getKeyNames() { - return Arrays.asList("HaplotypeScore"); + return Arrays.asList(GATKVCFConstants.HAPLOTYPE_SCORE_KEY); } + @Override public List getDescriptions() { - return Arrays.asList(new VCFInfoHeaderLine("HaplotypeScore", 1, VCFHeaderLineType.Float, "Consistency of the site with at most two segregating haplotypes")); + return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } private static class Haplotype { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HardyWeinberg.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HardyWeinberg.java index b511a1b90..9bf17a721 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HardyWeinberg.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HardyWeinberg.java @@ -52,25 +52,23 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import htsjdk.tribble.util.popgen.HardyWeinbergCalculation; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.apache.log4j.Logger; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.WorkInProgressAnnotation; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.QualityUtils; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypesContext; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** @@ -83,17 +81,20 @@ import java.util.Map; * *

Caveats

*
    - *
  • This annotation requires multiple samples and a valid pedigree file.
  • + *
  • This annotation requires multiple samples.
  • *
  • This is an experimental annotation. As such, it is unsupported; we do not make any guarantees that it will work properly, and you use it at your own risk.
  • *
  • Low confidence genotypes are ignored, which may adversely affect HW ratios. More analysis is needed to determine the right thing to do when the genotyper cannot decide whether a given sample is heterozygous or homozygous variant.
  • *
*/ public class HardyWeinberg extends InfoFieldAnnotation implements ExperimentalAnnotation { + private final static Logger logger = Logger.getLogger(HardyWeinberg.class); private static final int MIN_SAMPLES = 10; private static final int MIN_GENOTYPE_QUALITY = 10; private static final int MIN_LOG10_PERROR = MIN_GENOTYPE_QUALITY / 10; + private boolean warningLogged = false; + @Override public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, final ReferenceContext ref, @@ -102,8 +103,13 @@ public class HardyWeinberg extends InfoFieldAnnotation implements ExperimentalAn final Map stratifiedPerReadAlleleLikelihoodMap) { final GenotypesContext genotypes = vc.getGenotypes(); - if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) + if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) { + if ( !warningLogged ) { + logger.warn("Too few genotypes"); + warningLogged = true; + } return null; + } int refCount = 0; int hetCount = 0; @@ -132,12 +138,14 @@ public class HardyWeinberg extends InfoFieldAnnotation implements ExperimentalAn double pvalue = HardyWeinbergCalculation.hwCalculate(refCount, hetCount, homCount); //System.out.println(refCount + " " + hetCount + " " + homCount + " " + pvalue); - Map map = new HashMap(); + Map map = new HashMap<>(); map.put(getKeyNames().get(0), String.format("%.1f", QualityUtils.phredScaleErrorRate(pvalue))); return map; } - public List getKeyNames() { return Arrays.asList("HW"); } + @Override + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.HARDY_WEINBERG_KEY); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HW", 1, VCFHeaderLineType.Float, "Phred-scaled p-value for Hardy-Weinberg violation")); } + @Override + public List getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } } \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HomopolymerRun.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HomopolymerRun.java index 4c77ffff4..4bc2151fe 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HomopolymerRun.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HomopolymerRun.java @@ -51,17 +51,18 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.GenomeLoc; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.Arrays; import java.util.HashMap; @@ -103,14 +104,14 @@ public class HomopolymerRun extends InfoFieldAnnotation implements ExperimentalA return null; } - Map map = new HashMap(); + Map map = new HashMap<>(); map.put(getKeyNames().get(0), String.format("%d", run)); return map; } - public List getKeyNames() { return Arrays.asList("HRun"); } + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.HOMOPOLYMER_RUN_KEY); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("HRun", 1, VCFHeaderLineType.Integer, "Largest Contiguous Homopolymer Run of Variant Allele In Either Direction")); } + public List getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } public boolean useZeroQualityReads() { return false; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java index e44ff0635..458a1b696 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java @@ -52,9 +52,10 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.Walker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; @@ -62,11 +63,12 @@ import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnot import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.MathUtils; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypesContext; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.*; @@ -80,15 +82,21 @@ import java.util.*; *

The calculation is a continuous generalization of the Hardy-Weinberg test for disequilibrium that works well with limited coverage per sample. The output is a Phred-scaled p-value derived from running the HW test for disequilibrium with PL values. See the method document on statistical tests for a more detailed explanation of this statistical test.

* *

Caveats

- *

Note that the Inbreeding Coefficient can only be calculated for cohorts containing at least 10 founder samples.

+ *
    + *
  • The Inbreeding Coefficient can only be calculated for cohorts containing at least 10 founder samples.
  • + *
  • This annotation is used in variant recalibration, but may not be appropriate for that purpose if the cohort being analyzed contains many closely related individuals.
  • + *
  • This annotation requires a valid pedigree file.
  • + *
* */ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { + private final static Logger logger = Logger.getLogger(InbreedingCoeff.class); private static final int MIN_SAMPLES = 10; - private static final String INBREEDING_COEFFICIENT_KEY_NAME = "InbreedingCoeff"; private Set founderIds; private int sampleCount; + private boolean pedigreeCheckWarningLogged = false; + private boolean didUniquifiedSampleNameCheck = false; @Override public Map annotate(final RefMetaDataTracker tracker, @@ -98,9 +106,24 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno final VariantContext vc, final Map perReadAlleleLikelihoodMap ) { //If available, get the founder IDs and cache them. the IC will only be computed on founders then. - if(founderIds == null && walker != null) - founderIds = ((Walker)walker).getSampleDB().getFounderIds(); - return makeCoeffAnnotation(vc); + if(founderIds == null && walker != null) { + founderIds = ((Walker) walker).getSampleDB().getFounderIds(); + } + //if none of the "founders" are in the vc samples, assume we uniquified the samples upstream and they are all founders + if (!didUniquifiedSampleNameCheck) { + checkSampleNames(vc); + didUniquifiedSampleNameCheck = true; + } + if ( founderIds == null || founderIds.isEmpty() ) { + if ( !pedigreeCheckWarningLogged ) { + logger.warn("Annotation will not be calculated, must provide a valid PED file (-ped) from the command line."); + pedigreeCheckWarningLogged = true; + } + return null; + } + else{ + return makeCoeffAnnotation(vc); + } } protected double calculateIC(final VariantContext vc, final GenotypesContext genotypes) { @@ -124,7 +147,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno { if (g.isHetNonRef()) { //all likelihoods go to homCount - homCount += 1; + homCount++; continue; } @@ -168,9 +191,25 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno return Collections.singletonMap(getKeyNames().get(0), (Object)String.format("%.4f", F)); } - @Override - public List getKeyNames() { return Collections.singletonList(INBREEDING_COEFFICIENT_KEY_NAME); } + //this method is intended to reconcile uniquified sample names + // it comes into play when calling this annotation from GenotypeGVCFs with --uniquifySamples because founderIds + // is derived from the sampleDB, which comes from the input sample names, but vc will have uniquified (i.e. different) + // sample names. Without this check, the founderIds won't be found in the vc and the annotation won't be calculated. + protected void checkSampleNames(final VariantContext vc) { + Set vcSamples = new HashSet<>(); + vcSamples.addAll(vc.getSampleNames()); + if (!vcSamples.isEmpty()) { + if (founderIds!=null) { + vcSamples.removeAll(founderIds); + if (vcSamples.equals(vc.getSampleNames())) + founderIds = vc.getSampleNames(); + } + } + } @Override - public List getDescriptions() { return Collections.singletonList(new VCFInfoHeaderLine(INBREEDING_COEFFICIENT_KEY_NAME, 1, VCFHeaderLineType.Float, "Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation")); } + public List getKeyNames() { return Collections.singletonList(GATKVCFConstants.INBREEDING_COEFFICIENT_KEY); } + + @Override + public List getDescriptions() { return Collections.singletonList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } } \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LikelihoodRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LikelihoodRankSumTest.java index fd383c0e3..d18302e25 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LikelihoodRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LikelihoodRankSumTest.java @@ -53,14 +53,15 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.Arrays; import java.util.List; /** - * Rank Sum Test of per-read likelihoods of REF vs. ALT reads + * Rank Sum Test of per-read likelihoods of REF versus ALT reads * *

This variant-level annotation compares the likelihoods of reads to their best haplotype match, between reads that support the reference allele and those that support the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have lower likelihoods to their best haplotype match than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have higher likelihoods to their best haplotype match than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact.

* @@ -73,10 +74,10 @@ import java.util.List; */ public class LikelihoodRankSumTest extends RankSumTest { @Override - public List getKeyNames() { return Arrays.asList("LikelihoodRankSum"); } + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.LIKELIHOOD_RANK_SUM_KEY); } @Override - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("LikelihoodRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref haplotype likelihoods")); } + public List getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } @Override protected Double getElementForRead(final GATKSAMRecord read, final int refLoc, final MostLikelyAllele mostLikelyAllele) { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java index a30924187..1cc87240b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java @@ -51,27 +51,28 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.samples.Trio; import org.broadinstitute.gatk.engine.walkers.Walker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.RodRequiringAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.gatk.utils.MendelianViolation; -import htsjdk.variant.vcf.VCFHeaderLineType; +import org.broadinstitute.gatk.engine.samples.MendelianViolation; import htsjdk.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.gatk.utils.exceptions.UserException; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.*; /** * Likelihood of being a Mendelian Violation * - *

This annotation uses the likelihoods of the genotype calls to assess whether a site is transmitted from parents to offspring according to Mendelian rules. The output is the likelihood of the site being a Mendelian violation, which can be tentatively interpreted either as an indication of error (in the genotype calls) or as a possible mutation. The higher the output value, the more likely there is to be a Mendelian violation. Note that only positive values indicating likely MVs will be annotated; if the value for a given site is negative (indicating that there is no violation) the annotation is not written to the file.

+ *

This annotation uses the likelihoods of the genotype calls to assess whether a site is transmitted from parents to offspring according to Mendelian rules. The output is the likelihood of the site being a Mendelian violation, which can be tentatively interpreted either as an indication of error (in the genotype calls) or as a possible mutation. The higher the output value, the more likely there is to be a Mendelian violation. Note that only positive values indicating likely MVs will be annotated; if the value for a given site is negative (indicating that there is no violation) the annotation is not written to the file.

* *

Statistical notes

*

This annotation considers all possible combinations of all possible genotypes (homozygous-reference, heterozygous, and homozygous-variant) for each member of a trio, which amounts to 27 possible combinations. Using the Phred-scaled genotype likelihoods (PL values) from each individual, the likelihood of each combination is calculated, and the result contributes to the likelihood of the corresponding case (mendelian violation or non-violation) depending on which set it belongs to. See the method document on statistical tests for a more detailed explanation of this statistical test.

@@ -81,7 +82,7 @@ import java.util.*; *
  • The calculation assumes that the organism is diploid.
  • *
  • This annotation requires a valid pedigree file.
  • *
  • When multiple trios are present, the annotation is simply the maximum of the likelihood ratios, rather than the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain sites and many trios.
  • - *
  • This annotation can only be used from the Variant Annotator. If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect. If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.
  • + *
  • This annotation can only be used from the VariantAnnotator. If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect. If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.
  • * * *

    Related annotations

    @@ -93,9 +94,11 @@ import java.util.*; public class MVLikelihoodRatio extends InfoFieldAnnotation implements RodRequiringAnnotation { + private final static Logger logger = Logger.getLogger(MVLikelihoodRatio.class); private MendelianViolation mendelianViolation = null; - public static final String MVLR_KEY = "MVLR"; private Set trios; + private boolean walkerIdentityCheckWarningLogged = false; + private boolean pedigreeCheckWarningLogged = false; public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -103,17 +106,33 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements RodRequiri final Map stratifiedContexts, final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { - if ( mendelianViolation == null ) { - trios = ((Walker) walker).getSampleDB().getTrios(); - if ( trios.size() > 0 ) { - mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP ); - } - else { - throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line."); + + // Can only be called from VariantAnnotator + if ( !(walker instanceof VariantAnnotator) ) { + if ( !walkerIdentityCheckWarningLogged ) { + if ( walker != null ) + logger.warn("Annotation will not be calculated, must be called from VariantAnnotator, not " + walker.getClass().getName()); + else + logger.warn("Annotation will not be calculated, must be called from VariantAnnotator"); + walkerIdentityCheckWarningLogged = true; } + return null; } - Map attributeMap = new HashMap(1); + if ( mendelianViolation == null ) { + // Must have a pedigree file + trios = ((Walker) walker).getSampleDB().getTrios(); + if ( trios.isEmpty() ) { + if ( !pedigreeCheckWarningLogged ) { + logger.warn("Annotation will not be calculated, mendelian violation annotation must provide a valid PED file (-ped) from the command line."); + pedigreeCheckWarningLogged = true; + } + return null; + } + mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP ); + } + + Map attributeMap = new HashMap<>(1); //double pNoMV = 1.0; double maxMVLR = Double.MIN_VALUE; for ( Trio trio : trios ) { @@ -127,15 +146,16 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements RodRequiri //double pSomeMV = 1.0-pNoMV; //toRet.put("MVLR",Math.log10(pSomeMV)-Math.log10(1.0-pSomeMV)); if ( Double.compare(maxMVLR,Double.MIN_VALUE) != 0 ) - attributeMap.put(MVLR_KEY,maxMVLR); + attributeMap.put(getKeyNames().get(0), maxMVLR); return attributeMap; } // return the descriptions used for the VCF INFO meta field - public List getKeyNames() { return Arrays.asList(MVLR_KEY); } - - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(MVLR_KEY, 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); } + @Override + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.MENDEL_VIOLATION_LR_KEY); } + @Override + public List getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } private boolean contextHasTrioLikelihoods(VariantContext context, Trio trio) { for ( String sample : Arrays.asList(trio.getMaternalID(),trio.getPaternalID(),trio.getChildID()) ) { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java index 9da84183e..883c878b6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityRankSumTest.java @@ -53,15 +53,16 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.gatk.utils.pileup.PileupElement; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.*; /** - * Rank Sum Test for mapping qualities of REF vs. ALT reads + * Rank Sum Test for mapping qualities of REF versus ALT reads * *

    This variant-level annotation compares the mapping qualities of the reads supporting the reference allele with those supporting the alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the reads supporting the alternate allele have lower mapping quality scores than those supporting the reference allele. Conversely, a positive value indicates that the reads supporting the alternate allele have higher mapping quality scores than those supporting the reference allele.

    *

    This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in quality either way suggests that the sequencing and/or mapping process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants. @@ -80,10 +81,10 @@ import java.util.*; */ public class MappingQualityRankSumTest extends RankSumTest implements StandardAnnotation { @Override - public List getKeyNames() { return Arrays.asList("MQRankSum"); } + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.MAP_QUAL_RANK_SUM_KEY); } @Override - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MQRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities")); } + public List getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } @Override protected Double getElementForRead(final GATKSAMRecord read, final int refLoc) { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZero.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZero.java index 49bc74161..2b81d45c4 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZero.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZero.java @@ -51,13 +51,13 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardUGAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFInfoHeaderLine; @@ -78,6 +78,9 @@ import java.util.Map; * *

    This anotation gives you the count of all reads that have MAPQ = 0 across all samples. The count of reads with MAPQ0 can be used for quality control; high counts typically indicate regions where it is difficult to make confident calls.

    * + *

    Caveat

    + *

    It is not useful to apply this annotation with HaplotypeCaller because HC filters out all reads with MQ0 upfront, so the annotation will always return a value of 0.

    + * *

    Related annotations

    *
      *
    • MappingQualityZeroBySample gives the count of reads with MAPQ=0 for each individual sample.
    • @@ -85,7 +88,7 @@ import java.util.Map; *
    * */ -public class MappingQualityZero extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { +public class MappingQualityZero extends InfoFieldAnnotation implements StandardUGAnnotation, ActiveRegionBasedAnnotation { public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -104,7 +107,7 @@ public class MappingQualityZero extends InfoFieldAnnotation implements StandardA private Map annotatePileup(final ReferenceContext ref, final Map stratifiedContexts, final VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) + if ( stratifiedContexts.isEmpty() ) return null; int mq0 = 0; @@ -123,7 +126,7 @@ public class MappingQualityZero extends InfoFieldAnnotation implements StandardA private Map annotateWithLikelihoods(final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) { - if (stratifiedPerReadAlleleLikelihoodMap == null) + if ( stratifiedPerReadAlleleLikelihoodMap == null ) return null; int mq0 = 0; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/PossibleDeNovo.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/PossibleDeNovo.java index 1f677c6d0..6471488a5 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/PossibleDeNovo.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/PossibleDeNovo.java @@ -51,22 +51,21 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.samples.Sample; +import org.apache.log4j.Logger; + import org.broadinstitute.gatk.engine.samples.Trio; import org.broadinstitute.gatk.engine.walkers.Walker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.RodRequiringAnnotation; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.gatk.utils.MendelianViolation; -import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.samples.MendelianViolation; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import java.util.*; @@ -82,7 +81,7 @@ import java.util.*; *
  • Only reports possible de novos for children whose genotypes have not been tagged as filtered (which is most appropriate if parent likelihoods * have already been factored in using PhaseByTransmission).
  • *
  • When multiple trios are present, the annotation is simply the maximum of the likelihood ratios, rather than the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain sites and many trios.
  • - *
  • This annotation can only be used from the Variant Annotator.If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect. If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.
  • + *
  • This annotation can only be used from the Variant Annotator. If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect. If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.
  • * * *

    Related annotations

    @@ -94,14 +93,16 @@ import java.util.*; public class PossibleDeNovo extends InfoFieldAnnotation implements RodRequiringAnnotation, ExperimentalAnnotation { + private final static Logger logger = Logger.getLogger(PossibleDeNovo.class); + private MendelianViolation mendelianViolation = null; - public static final String HI_CONF_DENOVO_KEY = "hiConfDeNovo"; - public static final String LO_CONF_DENOVO_KEY = "loConfDeNovo"; - private final int hi_GQ_threshold = 20; - private final int lo_GQ_threshold = 10; + private final int hi_GQ_threshold = 20; //WARNING - If you change this value, update the description in GATKVCFHeaderLines + private final int lo_GQ_threshold = 10; //WARNING - If you change this value, update the description in GATKVCFHeaderLines private final double percentOfSamplesCutoff = 0.001; //for many, many samples use 0.1% of samples as allele frequency threshold for de novos private final int flatNumberOfSamplesCutoff = 4; private Set trios; + private boolean walkerIdentityCheckWarningLogged = false; + private boolean pedigreeCheckWarningLogged = false; public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -109,21 +110,35 @@ public class PossibleDeNovo extends InfoFieldAnnotation implements RodRequiringA final Map stratifiedContexts, final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { - if ( mendelianViolation == null ) { - trios = ((Walker) walker).getSampleDB().getTrios(); - if ( trios.size() > 0 ) { - mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP ); - } - else { - throw new UserException("Possible de novos annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line."); + + if ( !(walker instanceof VariantAnnotator ) ) { + if ( !walkerIdentityCheckWarningLogged ) { + if ( walker != null ) + logger.warn("Annotation will not be calculated, must be called from VariantAnnotator, not " + walker.getClass().getName()); + else + logger.warn("Annotation will not be calculated, must be called from VariantAnnotator"); + walkerIdentityCheckWarningLogged = true; } + return null; } - final Map attributeMap = new HashMap(1); + if ( mendelianViolation == null ) { + trios = ((Walker) walker).getSampleDB().getTrios(); + if ( trios.isEmpty() ) { + if ( !pedigreeCheckWarningLogged ) { + logger.warn("Annotation will not be calculated, must provide a valid PED file (-ped) from the command line."); + pedigreeCheckWarningLogged = true; + } + return null; + } + mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP ); + } + + final Map attributeMap = new HashMap<>(1); boolean isHighConfDeNovo = false; boolean isLowConfDeNovo = false; - final List highConfDeNovoChildren = new ArrayList(); - final List lowConfDeNovoChildren = new ArrayList(); + final List highConfDeNovoChildren = new ArrayList<>(); + final List lowConfDeNovoChildren = new ArrayList<>(); for ( final Trio trio : trios ) { if (vc.isBiallelic() && contextHasTrioLikelihoods(vc,trio) && mendelianViolation.isViolation(trio.getMother(),trio.getFather(),trio.getChild(),vc) ) { @@ -146,18 +161,15 @@ public class PossibleDeNovo extends InfoFieldAnnotation implements RodRequiringA final double AFcutoff = Math.max(flatNumberOfSamplesCutoff,percentNumberOfSamplesCutoff); final int deNovoAlleleCount = vc.getCalledChrCount(vc.getAlternateAllele(0)); //we assume we're biallelic above so use the first alt if ( isHighConfDeNovo && deNovoAlleleCount < AFcutoff ) - attributeMap.put(HI_CONF_DENOVO_KEY,highConfDeNovoChildren); + attributeMap.put(GATKVCFConstants.HI_CONF_DENOVO_KEY,highConfDeNovoChildren); if ( isLowConfDeNovo && deNovoAlleleCount < AFcutoff ) - attributeMap.put(LO_CONF_DENOVO_KEY,lowConfDeNovoChildren); + attributeMap.put(GATKVCFConstants.LO_CONF_DENOVO_KEY,lowConfDeNovoChildren); return attributeMap; } // return the descriptions used for the VCF INFO meta field - public List getKeyNames() { return Arrays.asList(HI_CONF_DENOVO_KEY,LO_CONF_DENOVO_KEY); } - - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(HI_CONF_DENOVO_KEY, 1, VCFHeaderLineType.String, "High confidence possible de novo mutation (GQ >= "+hi_GQ_threshold+" for all trio members)=[comma-delimited list of child samples]"), - new VCFInfoHeaderLine(LO_CONF_DENOVO_KEY, 1, VCFHeaderLineType.String, "Low confidence possible de novo mutation (GQ >= "+lo_GQ_threshold+" for child, GQ > 0 for parents)=[comma-delimited list of child samples]")); } - + @Override + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.HI_CONF_DENOVO_KEY, GATKVCFConstants.LO_CONF_DENOVO_KEY); } private boolean contextHasTrioLikelihoods(VariantContext context, Trio trio) { for ( String sample : Arrays.asList(trio.getMaternalID(),trio.getPaternalID(),trio.getChildID()) ) { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java index 004e5d18f..88390ad88 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java @@ -51,18 +51,20 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedGenotyper; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypesContext; @@ -78,7 +80,7 @@ import java.util.*; *

    Statistical notes

    *

    The calculation only takes into account coverage from samples genotyped as having the variant allele(s). This removes the influence of any homozygous-reference samples that might be present in the same cohort, which would otherwise penalize the call unfairly.

    * - *

    Caveats

    + *

    Caveat

    *

    This annotation can only be calculated for sites for which at least one sample was genotyped as carrying a variant allele.

    * *

    Related annotations

    @@ -151,9 +153,9 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati return null; final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc); - // Hack: when refContext == null then we know we are coming from the HaplotypeCaller and do not want to do a - // full length-based normalization (because the indel length problem is present only in the UnifiedGenotyper) - double QD = -10.0 * vc.getLog10PError() / ((double)standardDepth * indelNormalizationFactor(altAlleleLength, ref != null)); + // Hack: UnifiedGenotyper (but not HaplotypeCaller or GenotypeGVCFs) over-estimates the quality of long indels + // Penalize the QD calculation for UG indels to compensate for this + double QD = -10.0 * vc.getLog10PError() / ((double)standardDepth * indelNormalizationFactor(altAlleleLength, walker instanceof UnifiedGenotyper)); // Hack: see note in the fixTooHighQD method below QD = fixTooHighQD(QD); @@ -189,7 +191,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( QD < MAX_QD_BEFORE_FIXING ) { return QD; } else { - return IDEAL_HIGH_QD + GenomeAnalysisEngine.getRandomGenerator().nextGaussian() * JITTER_SIGMA; + return IDEAL_HIGH_QD + Utils.getRandomGenerator().nextGaussian() * JITTER_SIGMA; } } @@ -197,10 +199,10 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati private final static double IDEAL_HIGH_QD = 30; private final static double JITTER_SIGMA = 3; - public List getKeyNames() { return Arrays.asList("QD"); } + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.QUAL_BY_DEPTH_KEY); } public List getDescriptions() { - return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); + return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java index fbeea3331..038545cf4 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java index eb70a19a2..c257a05ff 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java @@ -52,9 +52,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java index 11f59157a..2c49355f9 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java @@ -57,17 +57,18 @@ import htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.gatk.tools.walkers.indels.PairHMMIndelErrorModel; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.*; /** - * Rank Sum Test for relative positioning of REF vs. ALT alleles within reads + * Rank Sum Test for relative positioning of REF versus ALT alleles within reads * *

    This variant-level annotation tests whether there is evidence of bias in the position of alleles within the reads that support them, between the reference and alternate alleles. Seeing an allele only near the ends of reads is indicative of error, because that is where sequencers tend to make the most errors. However, some variants located near the edges of sequenced regions will necessarily be covered by the ends of reads, so we can't just set an absolute "minimum distance from end of read" threshold. That is why we use a rank sum test to evaluate whether there is a difference in how well the reference allele and the alternate allele are supported.

    * @@ -85,11 +86,11 @@ import java.util.*; public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotation { @Override - public List getKeyNames() { return Arrays.asList("ReadPosRankSum"); } + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.READ_POS_RANK_SUM_KEY); } @Override public List getDescriptions() { - return Arrays.asList(new VCFInfoHeaderLine("ReadPosRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias")); + return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } @Override diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SampleList.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SampleList.java index b0f298048..c1c226a81 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SampleList.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SampleList.java @@ -51,17 +51,17 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import htsjdk.variant.vcf.VCFHeaderLineCount; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.Arrays; import java.util.HashMap; @@ -69,9 +69,9 @@ import java.util.List; import java.util.Map; /** - * List of samples that are polymorphic at a given site + * List samples that are non-reference at a given site * - *

    The output is a list of the samples that are genotyped as having one or more variant alleles. This allows you to easily determine which samples are polymorphic and compare them to samples that are homozygous-reference.

    + *

    The output is a list of the samples that are genotyped as having one or more variant alleles. This allows you to easily determine which samples are non-reference (heterozygous or homozygous-variant) and compare them to samples that are homozygous-reference.

    */ public class SampleList extends InfoFieldAnnotation { @@ -84,7 +84,7 @@ public class SampleList extends InfoFieldAnnotation { if ( vc.isMonomorphicInSamples() || !vc.hasGenotypes() ) return null; - StringBuffer samples = new StringBuffer(); + final StringBuilder samples = new StringBuilder(); for ( Genotype genotype : vc.getGenotypesOrderedByName() ) { if ( genotype.isCalled() && !genotype.isHomRef() ){ if ( samples.length() > 0 ) @@ -97,11 +97,11 @@ public class SampleList extends InfoFieldAnnotation { return null; Map map = new HashMap(); - map.put("Samples", samples.toString()); + map.put(getKeyNames().get(0), samples.toString()); return map; } - public List getKeyNames() { return Arrays.asList("Samples"); } + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.SAMPLE_LIST_KEY); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("Samples", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "List of polymorphic samples")); } + public List getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SpanningDeletions.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SpanningDeletions.java index 6b7b21b30..1aeb79a6b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SpanningDeletions.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SpanningDeletions.java @@ -51,22 +51,22 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedGenotyper; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardUGAnnotation; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.pileup.PileupElement; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** @@ -76,20 +76,36 @@ import java.util.Map; * *

    Caveats

    *
      - *
    • This annotation is not compatible with HaplotypeCaller; its purpose is to compensate for the UnifiedGenotyper's inability to integrate SNPs and indels in the same model (unlike HaplotypeCaller)
    • + *
    • In its current form, this annotation is not compatible with HaplotypeCaller. It is only meant to be used with UnifiedGenotyper, as its purpose is to compensate for the UnifiedGenotyper's inability to integrate SNPs and indels in the same model (unlike HaplotypeCaller).
    • *
    • By default, the UnifiedGenotyper will not call variants where the fraction of spanning deletions is above a certain threshold. This threshold can be adjusted using the `--max_deletion_fraction` argument.
    • *
    * */ -public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation { +public class SpanningDeletions extends InfoFieldAnnotation implements StandardUGAnnotation { + private final static Logger logger = Logger.getLogger(SpanningDeletions.class); + private boolean walkerIdentityCheckWarningLogged = false; + + @Override public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, final ReferenceContext ref, final Map stratifiedContexts, final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { - if ( stratifiedContexts.size() == 0 ) + // Can only call from UnifiedGenotyper + if ( !(walker instanceof UnifiedGenotyper) ) { + if ( !walkerIdentityCheckWarningLogged ) { + if ( walker != null ) + logger.warn("Annotation will not be calculated, must be called from UnifiedGenotyper, not " + walker.getClass().getName()); + else + logger.warn("Annotation will not be calculated, must be called from UnifiedGenotyper"); + walkerIdentityCheckWarningLogged = true; + } + return null; + } + + if ( stratifiedContexts.isEmpty() ) return null; // not meaningful when we're at an indel location: deletions that start at location N are by definition called at the position N-1, and at position N-1 @@ -106,12 +122,14 @@ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAn deletions++; } } - Map map = new HashMap(); + Map map = new HashMap<>(); map.put(getKeyNames().get(0), String.format("%.2f", depth == 0 ? 0.0 : (double)deletions/(double)depth)); return map; } - public List getKeyNames() { return Arrays.asList("Dels"); } + @Override + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.SPANNING_DELETIONS_KEY); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("Dels", 1, VCFHeaderLineType.Float, "Fraction of Reads Containing Spanning Deletions")); } + @Override + public List getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } } \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandAlleleCountsBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandAlleleCountsBySample.java new file mode 100644 index 000000000..21632b5eb --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandAlleleCountsBySample.java @@ -0,0 +1,169 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.annotator; + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFFormatHeaderLine; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; +import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; + +import java.util.Collections; +import java.util.List; +import java.util.Map; + +/** + * Number of forward and reverse reads that support each allele + * + *

    The StrandAlleleCountsBySample annotation produces read counts per allele (including (REF) and per strand. Note that, as with the AD annotation, the allele counts here should not be used to make assumptions about the called genotype.

    + * + *

    This annotation produces 2 values per allele at each site, corresponding to the number of reads that support the following (in that order):

    + *
      + *
    • the reference allele on the forward strand
    • + *
    • the reference allele on the reverse strand
    • + *
    • the first alternate allele on the forward strand
    • + *
    • the first alternate allele on the reverse strand
    • + *
    • the second alternate allele on the forward strand
    • + *
    • ...etc
    • + *
    + * + *

    Example

    + *
    GT:AD:GQ:PL:SB:SAC       1/2:1,18,12:99:1022,326,382,537,0,487:1,0,4,8:1,0,3,15,4,8
    + *

    In this example, the reference allele is supported by 1 read on the forward strand, the first alternate allele is supported by 3 forward and 15 reverse reads, and the second alternate allele is supported by 4 forward and 8 reverse reads.

    + * + *

    Caveats

    + *
      + *
    • This annotation can only be generated by HaplotypeCaller (it will not work when called from VariantAnnotator).
    • + *
    + * + *

    Related annotations

    + *
      + *
    • DepthPerAlleleBySample displays the number of reads supporting each allele, without stratifying by strand.
    • + *
    + */ + + +public class StrandAlleleCountsBySample extends GenotypeAnnotation { + private final static Logger logger = Logger.getLogger(StrandAlleleCountsBySample.class); + boolean[] warningsLogged = new boolean[4]; + + @Override + public void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { + + if ( !AnnotationUtils.isAppropriateInput(walker, alleleLikelihoodMap, g, warningsLogged, logger) ) { + return; + } + + gb.attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, getStrandCounts(Collections.singletonMap(g.getSampleName(), alleleLikelihoodMap), vc)); + } + + @Override + public List getKeyNames() { return Collections.singletonList(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY); } + + @Override + public List getDescriptions() { + return Collections.singletonList(GATKVCFHeaderLines.getFormatLine(getKeyNames().get(0))); + } + + /** + * This method was inspired by (copied from) StrandBiasTest.getContingencyTable(). Unlike getContingencyTable, it + * returns values for all alleles rather than only reference and the most likely allele. Since this is not useful + * for StrandBias calculations, it's here and it skips the Nx2 table format used in that method + */ + private int[] getStrandCounts( final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) { + if( stratifiedPerReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("stratifiedPerReadAlleleLikelihoodMap cannot be null"); } + if( vc == null ) { throw new IllegalArgumentException("input vc cannot be null"); } + + final int[] table = new int[vc.getNAlleles()*2]; + + for (final PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { + for (final Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { + final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + final GATKSAMRecord read = el.getKey(); + if (mostLikelyAllele.isInformative()) + updateTable(table, vc.getAlleleIndex(mostLikelyAllele.getAlleleIfInformative()), read); + } + } + + return table; + } + + private void updateTable(final int[] table, final int alleleIndex, final GATKSAMRecord read) { + if (alleleIndex < 0 || (alleleIndex+1)*2 > table.length) return; + final int offset = alleleIndex * 2; + + //Unstranded reads are not meaningful for this annotation, they can be found in the AD annotation + if (!read.isStrandless()) { + final boolean isFW = !read.getReadNegativeStrandFlag(); + table[offset + (isFW ? 0 : 1)]++; + } + } + +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java index 143b16edd..fdfa06241 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java @@ -51,9 +51,10 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; @@ -61,14 +62,15 @@ import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFFormatHeaderLine; -import htsjdk.variant.vcf.VCFHeaderLineType; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.*; /** * Number of forward and reverse reads that support REF and ALT alleles * - *

    Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The StrandBiasBySample annotation is produces read counts per allele and per strand that are used by other annotation modules (FisherStrand and StrandOddsRatio) to estimate strand bias using statistical approaches. + *

    Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The StrandBiasBySample annotation produces read counts per allele and per strand that are used by other annotation modules (FisherStrand and StrandOddsRatio) to estimate strand bias using statistical approaches. * *

    This annotation produces 4 values, corresponding to the number of reads that support the following (in that order):

    *
      @@ -82,6 +84,11 @@ import java.util.*; *
      GT:AD:GQ:PL:SB  0/1:53,51:99:1758,0,1835:23,30,33,18
      *

      In this example, the reference allele is supported by 23 forward reads and 30 reverse reads, the alternate allele is supported by 33 forward reads and 18 reverse reads.

      * + *

      Caveats

      + *
        + *
      • This annotation can only be generated by HaplotypeCaller (it will not work when called from VariantAnnotator).
      • + *
      + * *

      Related annotations

      *
        *
      • FisherStrand uses Fisher's Exact Test to evaluate strand bias.
      • @@ -91,8 +98,8 @@ import java.util.*; public class StrandBiasBySample extends GenotypeAnnotation { - - public final static String STRAND_BIAS_BY_SAMPLE_KEY_NAME = "SB"; + private final static Logger logger = Logger.getLogger(StrandBiasBySample.class); + boolean[] warningsLogged = new boolean[4]; @Override public void annotate(final RefMetaDataTracker tracker, @@ -103,21 +110,22 @@ public class StrandBiasBySample extends GenotypeAnnotation { final Genotype g, final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { - if ( ! isAppropriateInput(alleleLikelihoodMap, g) ) + + if (!AnnotationUtils.isAppropriateInput(walker, alleleLikelihoodMap, g, warningsLogged, logger)) { return; + } final int[][] table = FisherStrand.getContingencyTable(Collections.singletonMap(g.getSampleName(), alleleLikelihoodMap), vc, 0); - gb.attribute(STRAND_BIAS_BY_SAMPLE_KEY_NAME, FisherStrand.getContingencyArray(table)); + gb.attribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY, FisherStrand.getContingencyArray(table)); } @Override - public List getKeyNames() { return Collections.singletonList(STRAND_BIAS_BY_SAMPLE_KEY_NAME); } + public List getKeyNames() { + return Collections.singletonList(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY); + } @Override - public List getDescriptions() { return Collections.singletonList(new VCFFormatHeaderLine(getKeyNames().get(0), 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.")); } - - private boolean isAppropriateInput(final PerReadAlleleLikelihoodMap map, final Genotype g) { - return ! (map == null || g == null || !g.isCalled()); - } + public List getDescriptions() { + return Collections.singletonList(GATKVCFHeaderLines.getFormatLine(getKeyNames().get(0))); } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java index 3791b7912..40d8cecf6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java @@ -53,14 +53,14 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFFormatHeaderLine; import htsjdk.variant.vcf.VCFHeaderLine; -import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.apache.log4j.Logger; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import htsjdk.variant.variantcontext.Genotype; @@ -70,6 +70,7 @@ import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import java.util.*; @@ -78,34 +79,33 @@ import java.util.*; */ public abstract class StrandBiasTest extends InfoFieldAnnotation { private final static Logger logger = Logger.getLogger(StrandBiasTest.class); + private static boolean stratifiedPerReadAlleleLikelihoodMapWarningLogged = false; + private static boolean inputVariantContextWarningLogged = false; + private static boolean getTableFromSamplesWarningLogged = false; + private static boolean decodeSBBSWarningLogged = false; + + protected static final int ARRAY_DIM = 2; + protected static final int ARRAY_SIZE = ARRAY_DIM * ARRAY_DIM; @Override public void initialize(final AnnotatorCompatible walker, final GenomeAnalysisEngine toolkit, final Set headerLines) { - boolean hasSBBSannotation = false; + // Does the VCF header contain strand bias (SB) by sample annotation? for ( final VCFHeaderLine line : headerLines) { if ( line instanceof VCFFormatHeaderLine) { final VCFFormatHeaderLine formatline = (VCFFormatHeaderLine)line; - if ( formatline.getID().equals(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME) ) { - hasSBBSannotation = true; - break; + if ( formatline.getID().equals(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY) ) { + logger.warn("StrandBiasBySample annotation exists in input VCF header. Attempting to use StrandBiasBySample " + + "values to calculate strand bias annotation values. If no sample has the SB genotype annotation, annotation may still fail."); + return; } } } - if (hasSBBSannotation) { - logger.info("StrandBiasBySample annotation exists in input VCF header. Attempting to use StrandBiasBySample " + - "values to calculate strand bias annotation values. If no sample has the SB genotype annotation, annotation may still fail."); - return; - } - - boolean hasReads = toolkit.getReadsDataSource().getReaderIDs().size() > 0; - if (hasReads) { + // Are there reads from a SAM/BAM file? + if (toolkit.getReadsDataSource().getReaderIDs().isEmpty()) + logger.warn("No StrandBiasBySample annotation or read data was found. Strand bias annotations will not be output."); + else logger.info("SAM/BAM data was found. Attempting to use read data to calculate strand bias annotations values."); - return; - } - - logger.info(new String("No StrandBiasBySample annotation or read data was found. Strand bias annotations will not be output.")); - } @Override @@ -116,34 +116,37 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation { final Map stratifiedContexts, final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { + + // do not process if not a variant if ( !vc.isVariant() ) return null; + // if the genotype and strand bias are provided, calculate the annotation from the Genotype (GT) field if ( vc.hasGenotypes() ) { - boolean hasSB = false; for (final Genotype g : vc.getGenotypes()) { - if (g.hasAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME)) { - hasSB = true; - break; + if (g.hasAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)) { + return calculateAnnotationFromGTfield(vc.getGenotypes()); } } - if (hasSB) - return calculateAnnotationFromGTfield(vc.getGenotypes()); } - //stratifiedContexts can come come from VariantAnnotator, but will be size 0 if no reads were provided - if (vc.isSNP() && stratifiedContexts != null && stratifiedContexts.size() > 0) { + // if a the variant is a snp and has stratified contexts, calculate the annotation from the stratified contexts + //stratifiedContexts can come come from VariantAnnotator, but will be empty if no reads were provided + if (vc.isSNP() && stratifiedContexts != null && !stratifiedContexts.isEmpty()) { return calculateAnnotationFromStratifiedContexts(stratifiedContexts, vc); } - //stratifiedPerReadAllelelikelihoodMap can come from HaplotypeCaller call to VariantAnnotatorEngine + // calculate the annotation from the stratified per read likelihood map + // stratifiedPerReadAllelelikelihoodMap can come from HaplotypeCaller call to VariantAnnotatorEngine else if (stratifiedPerReadAlleleLikelihoodMap != null) { return calculateAnnotationFromLikelihoodMap(stratifiedPerReadAlleleLikelihoodMap, vc); } - else - // for non-snp variants, we need per-read likelihoods. + else { + // for non-snp variants, we need per-read likelihoods. // for snps, we can get same result from simple pileup + // for indels that do not have a computed strand bias (SB) or strand bias by sample (SBBS) return null; + } } protected abstract Map calculateAnnotationFromGTfield(final GenotypesContext genotypes); @@ -162,17 +165,23 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation { * @return the table used for several strand bias tests, will be null if none of the genotypes contain the per-sample SB annotation */ protected int[][] getTableFromSamples( final GenotypesContext genotypes, final int minCount ) { - if( genotypes == null ) { throw new IllegalArgumentException("Genotypes cannot be null."); } + if( genotypes == null ) { + if ( !getTableFromSamplesWarningLogged ) { + logger.warn("Genotypes cannot be null."); + getTableFromSamplesWarningLogged = true; + } + return null; + } final int[] sbArray = {0,0,0,0}; // reference-forward-reverse -by- alternate-forward-reverse boolean foundData = false; for( final Genotype g : genotypes ) { - if( g.isNoCall() || ! g.hasAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME) ) + if( g.isNoCall() || ! g.hasAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY) ) continue; foundData = true; - final String sbbsString = (String) g.getAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME); + final String sbbsString = (String) g.getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY); final int[] data = encodeSBBS(sbbsString); if ( passesMinimumThreshold(data, minCount) ) { for( int index = 0; index < sbArray.length; index++ ) { @@ -193,13 +202,13 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation { */ protected static int[][] getSNPContingencyTable(final Map stratifiedContexts, final Allele ref, - final Allele alt, + final List allAlts, final int minQScoreToConsider, final int minCount ) { - int[][] table = new int[2][2]; + int[][] table = new int[ARRAY_DIM][ARRAY_DIM]; for (final Map.Entry sample : stratifiedContexts.entrySet() ) { - final int[] myTable = new int[4]; + final int[] myTable = new int[ARRAY_SIZE]; for (final PileupElement p : sample.getValue().getBasePileup()) { if ( ! isUsableBase(p) ) // ignore deletions and bad MQ @@ -208,11 +217,13 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation { if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider ) continue; - updateTable(myTable, Allele.create(p.getBase(), false), p.getRead(), ref, alt); + updateTable(myTable, Allele.create(p.getBase(), false), p.getRead(), ref, allAlts); } - if ( passesMinimumThreshold( myTable, minCount ) ) + if ( passesMinimumThreshold( myTable, minCount ) ) { copyToMainTable(myTable, table); + } + } return table; @@ -228,19 +239,32 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation { public static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc, final int minCount) { - if( stratifiedPerReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("stratifiedPerReadAlleleLikelihoodMap cannot be null"); } - if( vc == null ) { throw new IllegalArgumentException("input vc cannot be null"); } + if( stratifiedPerReadAlleleLikelihoodMap == null ) { + if ( !stratifiedPerReadAlleleLikelihoodMapWarningLogged ) { + logger.warn("stratifiedPerReadAlleleLikelihoodMap cannot be null"); + stratifiedPerReadAlleleLikelihoodMapWarningLogged = true; + } + return null; + } + if( vc == null ) { + if ( !inputVariantContextWarningLogged ) { + logger.warn("input vc cannot be null"); + inputVariantContextWarningLogged = true; + } + return null; + } final Allele ref = vc.getReference(); final Allele alt = vc.getAltAlleleWithHighestAlleleCount(); - final int[][] table = new int[2][2]; + final List allAlts = vc.getAlternateAlleles(); + final int[][] table = new int[ARRAY_DIM][ARRAY_DIM]; for (final PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { - final int[] myTable = new int[4]; + final int[] myTable = new int[ARRAY_SIZE]; for (final Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); final GATKSAMRecord read = el.getKey(); - updateTable(myTable, mostLikelyAllele.getAlleleIfInformative(), read, ref, alt); + updateTable(myTable, mostLikelyAllele.getAlleleIfInformative(), read, ref, allAlts); } if ( passesMinimumThreshold(myTable, minCount) ) copyToMainTable(myTable, table); @@ -277,13 +301,14 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation { ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); } - private static void updateTable(final int[] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt) { + private static void updateTable(final int[] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final List allAlts) { final boolean matchesRef = allele.equals(ref, true); - final boolean matchesAlt = allele.equals(alt, true); + final boolean matchesAlt = allele.equals(allAlts.get(0), true); + final boolean matchesAnyAlt = allAlts.contains(allele); - if ( matchesRef || matchesAlt ) { - final int offset = matchesRef ? 0 : 2; + if ( matchesRef || matchesAnyAlt ) { + final int offset = matchesRef ? 0 : ARRAY_DIM; if ( read.isStrandless() ) { // a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1 @@ -303,7 +328,7 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation { * Does this strand data array pass the minimum threshold for inclusion? * * @param data the array - * @minCount The minimum threshold of counts in the array + * @param minCount The minimum threshold of counts in the array * @return true if it passes the minimum threshold, false otherwise */ protected static boolean passesMinimumThreshold(final int[] data, final int minCount) { @@ -317,9 +342,9 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation { * @return the array used by the per-sample Strand Bias annotation */ private static int[] encodeSBBS( final String string ) { - final int[] array = new int[4]; + final int[] array = new int[ARRAY_SIZE]; final StringTokenizer tokenizer = new StringTokenizer(string, ",", false); - for( int index = 0; index < 4; index++ ) { + for( int index = 0; index < ARRAY_SIZE; index++ ) { array[index] = Integer.parseInt(tokenizer.nextToken()); } return array; @@ -331,8 +356,14 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation { * @return the table used by the StrandOddsRatio annotation */ private static int[][] decodeSBBS( final int[] array ) { - if(array.length != 4) { throw new IllegalArgumentException("Expecting a length = 4 strand bias array."); } - final int[][] table = new int[2][2]; + if(array.length != ARRAY_SIZE) { + if ( !decodeSBBSWarningLogged ) { + logger.warn("Expecting a length = " + ARRAY_SIZE + " strand bias array."); + decodeSBBSWarningLogged = true; + } + return null; + } + final int[][] table = new int[ARRAY_DIM][ARRAY_DIM]; table[0][0] = array[0]; table[0][1] = array[1]; table[1][0] = array[2]; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java index 41f34b5f8..96913ceb9 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java @@ -51,18 +51,15 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypesContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.*; @@ -72,11 +69,11 @@ import java.util.*; *

        Strand bias is a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other. The StrandOddsRatio annotation is one of several methods that aims to evaluate whether there is strand bias in the data. It is an updated form of the Fisher Strand Test that is better at taking into account large amounts of data in high coverage situations. It is used to determine if there is strand bias between forward and reverse strands for the reference or alternate allele.

        * *

        Statistical notes

        - *

        Odds Ratios in the 2x2 contingency table below are + *

        Odds Ratios in the 2x2 contingency table below are

        * * $$ R = \frac{X[0][0] * X[1][1]}{X[0][1] * X[1][0]} $$ * - * and its inverse: + *

        and its inverse:

        * * * @@ -84,12 +81,16 @@ import java.util.*; * *
         + strand - strand
        ALT;X[1][0]X[1][1]
        * - * The sum R + 1/R is used to detect a difference in strand bias for REF and for ALT (the sum makes it symmetric). A high value is indicative of large difference where one entry is very small compared to the others. A scale factor of refRatio/altRatio where + *

        The sum R + 1/R is used to detect a difference in strand bias for REF and for ALT (the sum makes it symmetric). A high value is indicative of large difference where one entry is very small compared to the others. A scale factor of refRatio/altRatio where

        + * * $$ refRatio = \frac{max(X[0][0], X[0][1])}{min(X[0][0], X[0][1} $$ - * and + * + *

        and

        + * * $$ altRatio = \frac{max(X[1][0], X[1][1])}{min(X[1][0], X[1][1]} $$ - * ensures that the annotation value is large only. - *

        + * + *

        ensures that the annotation value is large only.

        + * *

        See the method document on statistical tests for a more detailed explanation of this statistical test.

        * *

        Related annotations

        @@ -103,8 +104,6 @@ public class StrandOddsRatio extends StrandBiasTest implements StandardAnnotatio private final static double AUGMENTATION_CONSTANT = 1.0; private static final int MIN_COUNT = 0; - private static final String SOR = "SOR"; - @Override protected Map calculateAnnotationFromGTfield(GenotypesContext genotypes){ final int[][] tableFromPerSampleAnnotations = getTableFromSamples( genotypes, MIN_COUNT ); @@ -118,7 +117,7 @@ public class StrandOddsRatio extends StrandBiasTest implements StandardAnnotatio @Override protected Map calculateAnnotationFromStratifiedContexts(Map stratifiedContexts, final VariantContext vc){ - final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1, MIN_COUNT); + final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAlleles(), -1, MIN_COUNT); final double ratio = calculateSOR(tableNoFiltering); return annotationForOneTable(ratio); } @@ -166,9 +165,9 @@ public class StrandOddsRatio extends StrandBiasTest implements StandardAnnotatio * @return the augmented table */ private static double[][] augmentContingencyTable(final int[][] table) { - double[][] augmentedTable = new double[2][2]; - for ( int i = 0; i < 2; i++ ) { - for ( int j = 0; j < 2; j++ ) + double[][] augmentedTable = new double[ARRAY_DIM][ARRAY_DIM]; + for ( int i = 0; i < ARRAY_DIM; i++ ) { + for ( int j = 0; j < ARRAY_DIM; j++ ) augmentedTable[i][j] = table[i][j] + AUGMENTATION_CONSTANT; } @@ -183,16 +182,16 @@ public class StrandOddsRatio extends StrandBiasTest implements StandardAnnotatio */ protected Map annotationForOneTable(final double ratio) { final Object value = String.format("%.3f", ratio); - return Collections.singletonMap(SOR, value); + return Collections.singletonMap(getKeyNames().get(0), value); } @Override public List getDescriptions() { - return Collections.singletonList(new VCFInfoHeaderLine(SOR, 1, VCFHeaderLineType.Float, "Symmetric Odds Ratio of 2x2 contingency table to detect strand bias")); + return Collections.singletonList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } @Override public List getKeyNames() { - return Collections.singletonList(SOR); + return Collections.singletonList(GATKVCFConstants.STRAND_ODDS_RATIO_KEY); } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TandemRepeatAnnotator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TandemRepeatAnnotator.java index 4163f7bb7..a781d51c7 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TandemRepeatAnnotator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TandemRepeatAnnotator.java @@ -51,24 +51,23 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardUGAnnotation; +import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller; +import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import org.broadinstitute.gatk.utils.collections.Pair; -import htsjdk.variant.vcf.VCFHeaderLineCount; -import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFInfoHeaderLine; import htsjdk.variant.variantcontext.VariantContext; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** * Tandem repeat unit composition and counts per allele @@ -77,50 +76,48 @@ import java.util.Map; * *

        A tandem repeat unit is composed of one or more nucleotides that are repeated multiple times in series. Repetitive sequences are difficult to map to the reference because they are associated with multiple alignment possibilities. Knowing the number of repeat units in a set of tandem repeats tells you the number of different positions the tandem repeat can be placed in. The observation of many tandem repeat units multiplies the number of possible representations that can be made of the region. * - *

        Caveats

        + *

        Caveat

        *
          *
        • This annotation is currently not compatible with HaplotypeCaller.
        • *
        * */ -public class TandemRepeatAnnotator extends InfoFieldAnnotation implements StandardAnnotation { - private static final String STR_PRESENT = "STR"; - private static final String REPEAT_UNIT_KEY = "RU"; - private static final String REPEATS_PER_ALLELE_KEY = "RPA"; +public class TandemRepeatAnnotator extends InfoFieldAnnotation implements StandardUGAnnotation, ActiveRegionBasedAnnotation { + private final static Logger logger = Logger.getLogger(TandemRepeatAnnotator.class); + private boolean walkerIdentityCheckWarningLogged = false; + + @Override public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, final ReferenceContext ref, final Map stratifiedContexts, final VariantContext vc, - final Map stratifiedPerReadAlleleLikelihoodMap) { + final Map stratifiedPerReadAlleleLikelihoodMap) throws UserException { + if ( !vc.isIndel()) return null; - Pair,byte[]> result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, ref.getForwardBases()); + final Pair,byte[]> result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, ref.getForwardBases()); if (result == null) return null; - byte[] repeatUnit = result.second; - List numUnits = result.first; + final byte[] repeatUnit = result.second; + final List numUnits = result.first; - Map map = new HashMap(); - map.put(STR_PRESENT,true); - map.put(REPEAT_UNIT_KEY,new String(repeatUnit)); - map.put(REPEATS_PER_ALLELE_KEY, numUnits); + final Map map = new HashMap<>(); + map.put(GATKVCFConstants.STR_PRESENT_KEY, true); + map.put(GATKVCFConstants.REPEAT_UNIT_KEY, new String(repeatUnit)); + map.put(GATKVCFConstants.REPEATS_PER_ALLELE_KEY, numUnits); return map; } - protected static final String[] keyNames = {STR_PRESENT, REPEAT_UNIT_KEY,REPEATS_PER_ALLELE_KEY }; - protected static final VCFInfoHeaderLine[] descriptions = { - new VCFInfoHeaderLine(STR_PRESENT, 0, VCFHeaderLineType.Flag, "Variant is a short tandem repeat"), - new VCFInfoHeaderLine(REPEAT_UNIT_KEY, 1, VCFHeaderLineType.String, "Tandem repeat unit (bases)"), - new VCFInfoHeaderLine(REPEATS_PER_ALLELE_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Number of times tandem repeat unit is repeated, for each allele (including reference)") }; - + @Override public List getKeyNames() { - return Arrays.asList(keyNames); + return Arrays.asList( + GATKVCFConstants.STR_PRESENT_KEY, + GATKVCFConstants.REPEAT_UNIT_KEY, + GATKVCFConstants.REPEATS_PER_ALLELE_KEY); } - public List getDescriptions() { return Arrays.asList(descriptions); } - } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TransmissionDisequilibriumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TransmissionDisequilibriumTest.java index 430c71597..a96878a0f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TransmissionDisequilibriumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TransmissionDisequilibriumTest.java @@ -51,21 +51,20 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.samples.Sample; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.RodRequiringAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.MathUtils; -import htsjdk.variant.vcf.VCFHeaderLineCount; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.gatk.utils.exceptions.UserException; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.*; @@ -77,8 +76,6 @@ import java.util.*; *

        Statistical notes

        *

        The calculation is based on the derivation described in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT.

        * - *

        Note that this annotation requires a valid ped file.

        - * *

        Caveat

        *
          *
        • This annotation requires a valid pedigree file.
        • @@ -88,26 +85,46 @@ import java.util.*; */ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements RodRequiringAnnotation { - + private final static Logger logger = Logger.getLogger(TransmissionDisequilibriumTest.class); private Set trios = null; private final static int MIN_NUM_VALID_TRIOS = 5; // don't calculate this population-level statistic if there are less than X trios with full genotype likelihood information + private boolean walkerIdentityCheckWarningLogged = false; + private boolean pedigreeCheckWarningLogged = false; + @Override public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, final ReferenceContext ref, final Map stratifiedContexts, final VariantContext vc, - final Map stratifiedPerReadAlleleLikelihoodMap) { + final Map stratifiedPerReadAlleleLikelihoodMap){ + + // Can only be called from VariantAnnotator + if ( !(walker instanceof VariantAnnotator) ) { + if ( !walkerIdentityCheckWarningLogged ) { + if ( walker != null ) + logger.warn("Annotation will not be calculated, must be called from VariantAnnotator, not " + walker.getClass().getName()); + else + logger.warn("Annotation will not be calculated, must be called from VariantAnnotator"); + walkerIdentityCheckWarningLogged = true; + } + return null; + } + + // Get trios from the input pedigree file. if ( trios == null ) { - if ( walker instanceof VariantAnnotator ) { - trios = ((VariantAnnotator) walker).getSampleDB().getChildrenWithParents(); - } else { - throw new UserException("Transmission disequilibrium test annotation can only be used from the Variant Annotator and requires a valid ped file be passed in."); + trios = ((VariantAnnotator) walker).getSampleDB().getChildrenWithParents(); + if (trios == null || trios.isEmpty()) { + if ( !pedigreeCheckWarningLogged ) { + logger.warn("Transmission disequilibrium test annotation requires a valid ped file be passed in."); + pedigreeCheckWarningLogged = true; + } + return null; } } - final Map toRet = new HashMap(1); - final HashSet triosToTest = new HashSet(); + final Map toRet = new HashMap<>(1); + final HashSet triosToTest = new HashSet<>(); for( final Sample child : trios ) { final boolean hasAppropriateGenotypes = vc.hasGenotype(child.getID()) && vc.getGenotype(child.getID()).hasLikelihoods() && @@ -126,14 +143,16 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen } // return the descriptions used for the VCF INFO meta field - public List getKeyNames() { return Arrays.asList("TDT"); } + @Override + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.TRANSMISSION_DISEQUILIBRIUM_KEY); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("TDT", VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Test statistic from Wittkowski transmission disequilibrium test.")); } + @Override + public List getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } // Following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT private List calculateTDT( final VariantContext vc, final Set triosToTest ) { - List pairwiseTDTs = new ArrayList(10); + List pairwiseTDTs = new ArrayList<>(10); final int HomRefIndex = 0; // for each pair of alleles, add the likelihoods diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantType.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantType.java index 49adb5161..c92cac17c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantType.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantType.java @@ -51,16 +51,17 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.IndelUtils; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.*; @@ -100,13 +101,13 @@ public class VariantType extends InfoFieldAnnotation { } } - Map map = new HashMap(); + Map map = new HashMap<>(); map.put(getKeyNames().get(0), String.format("%s", type)); return map; } - public List getKeyNames() { return Arrays.asList("VariantType"); } + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.VARIANT_TYPE_KEY); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("VariantType", 1, VCFHeaderLineType.String, "Variant type description")); } + public List getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/AnalyzeCovariates.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/AnalyzeCovariates.java index 45ab38542..cd68e028a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/AnalyzeCovariates.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/AnalyzeCovariates.java @@ -56,17 +56,17 @@ import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Input; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.recalibration.RecalUtils; -import org.broadinstitute.gatk.utils.recalibration.RecalibrationReport; -import org.broadinstitute.gatk.utils.recalibration.BaseRecalibration; +import org.broadinstitute.gatk.engine.recalibration.RecalUtils; +import org.broadinstitute.gatk.engine.recalibration.RecalibrationReport; +import org.broadinstitute.gatk.engine.recalibration.BaseRecalibration; import java.io.File; import java.io.FileNotFoundException; @@ -76,28 +76,31 @@ import java.util.Map; /** - * Tool to analyze and evaluate base recalibration tables. + * Create plots to visualize base recalibration results + * *

          - * It generates plots to assess the quality of a recalibration run. + * This tool generates plots for visualizing the quality of a recalibration run. + *

          * *

          Input

          * - * The tool can take up to three different sets of recalibration tables. + *

          The tool can take up to three different sets of recalibration tables. * The resulting plots will be overlaid on top of each other to make - * comparisons easy. + * comparisons easy.

          * + *
          * * * * * - * + * * * * * * @@ -105,32 +108,19 @@ import java.util.Map; *
          SetArgumentLabelColorDescription
          Original-beforeBEFOREMaroon1
          Original-beforeBEFOREPinkFirst pass recalibration - * tables obtained from applying {@link BaseRecalibration} + * tables obtained from applying BaseRecalibration * on the original alignment.
          Recalibrated-afterAFTERBlueSecond pass recalibration tables - * results from the application of {@link BaseRecalibration} + * results from the application of BaseRecalibration * on the alignment recalibrated using the first pass tables
          Input-BQSRBQSRBlackAny recalibration table without a specific role
          *
          * - * You need to specify one set at least. Multiple sets need to have the same values for the following parameters: + *

          You need to specify at least one set. Multiple sets need to have the same values for the following parameters: *

          * covariate (order is not important), no_standard_covs, run_without_dbsnp, solid_recal_mode, * solid_nocall_strategy, mismatches_context_size, mismatches_default_quality, deletions_default_quality, * insertions_default_quality, maximum_cycle_value, low_quality_tail, default_platform, force_platform, * quantizing_levels and binary_tag_name + *

          + * *

          Output

          * - * Currently this tool generates two outputs: + *

          A pdf document with plots that show the quality of the recalibration, and an optional csv file that contains a table with all the data required to generate those plots.

          * - *
          - *
          -plots my-report.pdf
          - *
          A pdf document that encloses plots to assess the quality of the recalibration.
          - *
          -csv my-report.csv
          - *
          A csv file that contains a table with all the data required to generate those plots.
          - *
          - * - * You need to specify at least one of them. - * - *

          Other Arguments

          - * - *

          -ignoreLMT, --ignoreLastModificationTimes

          - * - * when set, no warning message will be displayed in the -before recalibration table file is older than the -after one. - * - *

          Examples

          + *

          Usage examples

          * * *

          Plot a single recalibration table

          @@ -142,7 +132,7 @@ import java.util.Map; * -plots BQSR.pdf * * - *

          Plot before (first pass) and after (second pass) recalibration table to compare them

          + *

          Plot before (first pass) and after (second pass) recalibration tables to compare them

          * *
            * java -jar GenomeAnalysisTK.jar \
          @@ -157,8 +147,8 @@ import java.util.Map;
            *
            * 
            *
          - * # You can ignore the before/after semantics completely if you like (if you do add -ignoreLMT
          - * # to avoid a possible warning), but all tables should have been generated using the same parameters.
          + * # You can ignore the before/after semantics completely if you like (if you do, add -ignoreLMT
          + * # to avoid a possible warning), but all tables must have been generated using the same parameters.
            *
            * java -jar GenomeAnalysisTK.jar \
            *      -T AnalyzeCovariates \
          @@ -173,31 +163,29 @@ import java.util.Map;
            * 

          Full BQSR quality assessment pipeline

          * *
          - * # Generate the first pass recalibration table file.
          + * # Generate the first pass recalibration table file
            * java -jar GenomeAnalysisTK.jar \
            *      -T BaseRecalibrator \
          - *      -R myreference.fasta \
          + *      -R reference.fasta \
            *      -I myinput.bam \
          - *      -knownSites bundle/my-trusted-snps.vcf \ # optional but recommendable
          - *      -knownSites bundle/my-trusted-indels.vcf \ # optional but recommendable
          - *      ... other options
          + *      -knownSites bundle/my-trusted-snps.vcf \ # optional but recommended
          + *      -knownSites bundle/my-trusted-indels.vcf \ # optional but recommended
            *      -o firstpass.table
            *
          - * # Generate the second pass recalibration table file.
          + * # Generate the second pass recalibration table file
            * java -jar GenomeAnalysisTK.jar \
            *      -T BaseRecalibrator \
          - *      -BQSR firstpass.table \
          - *      -R myreference.fasta \
          + *      -R reference.fasta \
            *      -I myinput.bam \
            *      -knownSites bundle/my-trusted-snps.vcf \
            *      -knownSites bundle/my-trusted-indels.vcf \
          - *      ... other options \
          + *      -BQSR firstpass.table \
            *      -o secondpass.table
            *
          - * # Finally generate the plots and also keep a copy of the csv (optional).
          + * # Finally generate the plots and also keep a copy of the csv (optional)
            * java -jar GenomeAnalysisTK.jar \
            *      -T AnalyzeCovariates \
          - *      -R myrefernce.fasta \
          + *      -R reference.fasta \
            *      -before firstpass.table \
            *      -after secondpass.table \
            *      -csv BQSR.csv \ # optional
          @@ -251,14 +239,14 @@ public final class AnalyzeCovariates extends RodWalker
          +     *
                * This field value is resolved by {@link #initialize()}.
                */
               protected File bqsrFile = null;
           
               /**
                * Checks inputs and argument values.
          -     * 

          + * * Notice that this routine will not validate the content of files. It may have some minor side effects as * the output of warning messages back to the user. * @@ -370,7 +358,6 @@ public final class AnalyzeCovariates extends RodWalker * If plotsFile is null, it does not perform any plotting. * * @param csvFile the intermediary csv file. @@ -453,9 +440,9 @@ public final class AnalyzeCovariates extends RodWalker + * * The key is the role and the value the corresponding report file. - *

          + * * Roles: "Before" (recalibration), "After" (recalibration), "BQSR" (the tool standard argument recalibration file) * * @return never null @@ -523,7 +510,7 @@ public final class AnalyzeCovariates extends RodWalker + * * This is the the one specified by the user if any or a temporary file * that will be deleted as soon as the VM exists by default. * diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGatherer.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGatherer.java deleted file mode 100644 index d9f59d856..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGatherer.java +++ /dev/null @@ -1,139 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.tools.walkers.bqsr; - -import org.apache.commons.collections.CollectionUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.commandline.Gatherer; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.recalibration.RecalibrationReport; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; -import java.util.*; - -/** - * User: carneiro - * Date: 3/29/11 - */ - - -public class BQSRGatherer extends Gatherer { - - private static final Logger logger = Logger.getLogger(BQSRGatherer.class); - private static final String EMPTY_INPUT_LIST = "list of inputs files is empty or there is no usable data in any input file"; - private static final String MISSING_OUTPUT_FILE = "missing output file name"; - private static final String MISSING_READ_GROUPS = "Missing read group(s)"; - - @Override - public void gather(final List inputs, final File output) { - final PrintStream outputFile; - try { - outputFile = new PrintStream(output); - } catch(FileNotFoundException e) { - throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE); - } - final GATKReport report = gatherReport(inputs); - report.print(outputFile); - } - - /** - * Gathers the input recalibration reports into a single report. - * - * @param inputs Input recalibration GATK reports - * @return gathered recalibration GATK report - */ - public static GATKReport gatherReport(final List inputs) { - final SortedSet allReadGroups = new TreeSet(); - final LinkedHashMap> inputReadGroups = new LinkedHashMap>(); - - // Get the read groups from each input report - for (final File input : inputs) { - final Set readGroups = RecalibrationReport.getReadGroups(input); - inputReadGroups.put(input, readGroups); - allReadGroups.addAll(readGroups); - } - - // Log the read groups that are missing from specific inputs - for (Map.Entry> entry: inputReadGroups.entrySet()) { - final File input = entry.getKey(); - final Set readGroups = entry.getValue(); - if (allReadGroups.size() != readGroups.size()) { - // Since this is not completely unexpected, more than debug, but less than a proper warning. - logger.info(MISSING_READ_GROUPS + ": " + input.getAbsolutePath()); - for (final Object readGroup: CollectionUtils.subtract(allReadGroups, readGroups)) { - logger.info(" " + readGroup); - } - } - } - - RecalibrationReport generalReport = null; - for (File input : inputs) { - final RecalibrationReport inputReport = new RecalibrationReport(input, allReadGroups); - if( inputReport.isEmpty() ) { continue; } - - if (generalReport == null) - generalReport = inputReport; - else - generalReport.combine(inputReport); - } - if (generalReport == null) - throw new ReviewedGATKException(EMPTY_INPUT_LIST); - - generalReport.calculateQuantizedQualities(); - - return generalReport.createGATKReport(); - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java index fd87c7f31..2c6744f97 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java @@ -55,15 +55,16 @@ import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.CigarElement; import htsjdk.samtools.SAMFileHeader; import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.engine.recalibration.*; import org.broadinstitute.gatk.engine.walkers.*; import org.broadinstitute.gatk.utils.commandline.Advanced; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.engine.filters.*; import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.baq.BAQ; @@ -74,7 +75,7 @@ import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.recalibration.*; -import org.broadinstitute.gatk.utils.recalibration.covariates.Covariate; +import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.ReadUtils; @@ -85,49 +86,49 @@ import java.util.Arrays; import java.util.List; /** - * First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context). + * Generate base recalibration table to compensate for systematic errors * *

          - * This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating + * This tool is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating * only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative - * of poor base quality. This walker generates tables based on various user-specified covariates (such as read group, - * reported quality score, cycle, and context). Since there is a large amount of data one can then calculate an empirical + * of poor base quality. This tool generates tables based on various user-specified covariates (such as read group, + * reported quality score, cycle, and context). Since there is a large amount of data, one can then calculate an empirical * probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations. * The output file is a table (of the several covariate values, num observations, num mismatches, empirical quality score). + *

          *

          - * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified. - * - *

          + * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added regardless of whether + * or not they were specified. + *

          * *

          Input

          *

          - * The input read data whose base quality scores need to be assessed. + * A BAM file containing data that needs to be recalibrated. *

          - * A database of known polymorphic sites to skip over. + * A database of known polymorphic sites to mask out. *

          * *

          Output

          - *

          - * A GATK Report file with many tables: - *

            + *

            A GATKReport file with many tables:

            + *
              *
            • The list of arguments
            • *
            • The quantized qualities table
            • *
            • The recalibration table by read group
            • *
            • The recalibration table by quality score
            • *
            • The recalibration table for all the optional covariates
            • - *
          - * - * The GATK Report is intended to be easy to read by humans or computers. Check out the documentation of the GATKReport to learn how to manipulate this table. + *
        + *

        + * The GATKReport table format is intended to be easy to read by both humans and computer languages (especially R). + * Check out the documentation of the GATKReport (in the FAQs) to learn how to manipulate this table. *

        * - *

        Examples

        + *

        Usage example

        *
        - * java -Xmx4g -jar GenomeAnalysisTK.jar \
        + * java -jar GenomeAnalysisTK.jar \
          *   -T BaseRecalibrator \
        + *   -R reference.fasta \
          *   -I my_reads.bam \
        - *   -R resources/Homo_sapiens_assembly18.fasta \
        - *   -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
        - *   -knownSites another/optional/setOfSitesToMask.vcf \
        + *   -knownSites latest_dbsnp.vcf \
          *   -o recal_data.table
          * 
        */ @@ -138,16 +139,16 @@ import java.util.List; @PartitionBy(PartitionType.READ) public class BaseRecalibrator extends ReadWalker implements NanoSchedulable { /** - * all the command line arguments for BQSR and it's covariates + * all the command line arguments for BQSR and its covariates */ @ArgumentCollection private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); /** - * When you have nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency + * When you use nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency * purposes. If you have many covariates, and therefore are using a lot of memory, you can use this flag * to safely access only one table. There may be some CPU cost, but as long as the table is really big - * there should be relatively little CPU costs. + * the cost should be relatively reasonable. */ @Argument(fullName = "lowMemoryMode", shortName="lowMemoryMode", doc="Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false) public boolean lowMemoryMode = false; @@ -170,7 +171,7 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche private int minimumQToUse; - private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation."; + private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to mask out known variant sites. Please provide a VCF file containing known sites of genetic variation."; private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation @@ -312,6 +313,12 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche final boolean[] knownSites = new boolean[readLength]; Arrays.fill(knownSites, false); for( final Feature f : features ) { + if ((f.getStart() < read.getSoftStart() && f.getEnd() < read.getSoftStart()) || + (f.getStart() > read.getSoftEnd() && f.getEnd() > read.getSoftEnd())) { + // feature is outside clipping window for the read, ignore + continue; + } + int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here? if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureStartOnRead = 0; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/ReadRecalibrationInfo.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/ReadRecalibrationInfo.java index 53d26c176..c3914216d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/ReadRecalibrationInfo.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/ReadRecalibrationInfo.java @@ -55,7 +55,7 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.recalibration.EventType; -import org.broadinstitute.gatk.utils.recalibration.ReadCovariates; +import org.broadinstitute.gatk.engine.recalibration.ReadCovariates; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; /** diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/RecalibrationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/RecalibrationArgumentCollection.java deleted file mode 100644 index 4bc4af2e4..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/RecalibrationArgumentCollection.java +++ /dev/null @@ -1,420 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.tools.walkers.bqsr; - -import com.google.java.contract.Requires; -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.engine.report.GATKReportTable; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.exceptions.GATKException; -import org.broadinstitute.gatk.utils.recalibration.RecalUtils; - -import java.io.File; -import java.io.PrintStream; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 27, 2009 - * - * A collection of the arguments that are used for BQSR. Used to be common to both CovariateCounterWalker and TableRecalibrationWalker. - * This set of arguments will also be passed to the constructor of every Covariate when it is instantiated. - */ - -public class RecalibrationArgumentCollection implements Cloneable { - - /** - * This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference, - * so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.) - * for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites. - * Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument. - */ - @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false) - public List> knownSites = Collections.emptyList(); - - /** - * After the header, data records occur one per line until the end of the file. The first several items on a line are the - * values of the individual covariates and will change depending on which covariates were specified at runtime. The last - * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, - * and the raw empirical quality score calculated by phred-scaling the mismatch rate. Use '/dev/stdout' to print to standard out. - */ - @Gather(BQSRGatherer.class) - @Output(doc = "The output recalibration table file to create", required = true) - public File RECAL_TABLE_FILE = null; - public PrintStream RECAL_TABLE; - - /** - * Note that the --list argument requires a fully resolved and correct command-line to work. - */ - @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false) - public boolean LIST_ONLY = false; - - /** - * Note that the ReadGroup and QualityScore covariates are required and do not need to be specified. - * Also, unless --no_standard_covs is specified, the Cycle and Context covariates are standard and are included by default. - * Use the --list argument to see the available covariates. - */ - @Argument(fullName = "covariate", shortName = "cov", doc = "One or more covariates to be used in the recalibration. Can be specified multiple times", required = false) - public String[] COVARIATES = null; - - /* - * The Cycle and Context covariates are standard and are included by default unless this argument is provided. - * Note that the ReadGroup and QualityScore covariates are required and cannot be excluded. - */ - @Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false) - public boolean DO_NOT_USE_STANDARD_COVARIATES = false; - - /** - * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option. - */ - @Advanced - @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.") - public boolean RUN_WITHOUT_DBSNP = false; - - /** - * BaseRecalibrator accepts a --solid_recal_mode flag which governs how the recalibrator handles the - * reads which have had the reference inserted because of color space inconsistencies. - */ - @Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS") - public RecalUtils.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.SET_Q_ZERO; - - /** - * BaseRecalibrator accepts a --solid_nocall_strategy flag which governs how the recalibrator handles - * no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in - * their color space tag can not be recalibrated. - */ - @Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false) - public RecalUtils.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; - - /** - * The context covariate will use a context of this size to calculate its covariate value for base mismatches. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size. - */ - @Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "Size of the k-mer context to be used for base mismatches", required = false) - public int MISMATCHES_CONTEXT_SIZE = 2; - - /** - * The context covariate will use a context of this size to calculate its covariate value for base insertions and deletions. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size. - */ - @Argument(fullName = "indels_context_size", shortName = "ics", doc = "Size of the k-mer context to be used for base insertions and deletions", required = false) - public int INDELS_CONTEXT_SIZE = 3; - - /** - * The cycle covariate will generate an error if it encounters a cycle greater than this value. - * This argument is ignored if the Cycle covariate is not used. - */ - @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "The maximum cycle value permitted for the Cycle covariate", required = false) - public int MAXIMUM_CYCLE_VALUE = 500; - - /** - * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is off] - */ - @Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false) - public byte MISMATCHES_DEFAULT_QUALITY = -1; - - /** - * A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. [default is on] - */ - @Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false) - public byte INSERTIONS_DEFAULT_QUALITY = 45; - - /** - * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is on] - */ - @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false) - public byte DELETIONS_DEFAULT_QUALITY = 45; - - /** - * Reads with low quality bases on either tail (beginning or end) will not be considered in the context. This parameter defines the quality below which (inclusive) a tail is considered low quality - */ - @Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false) - public byte LOW_QUAL_TAIL = 2; - - /** - * BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does not quantize the base qualities, this is done by the engine with the -qq or -BQSR options. - * This parameter tells BQSR the number of levels of quantization to use to build the quantization table. - */ - @Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output") - public int QUANTIZING_LEVELS = 16; - - /** - * The tag name for the binary tag covariate (if using it) - */ - @Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it") - public String BINARY_TAG_NAME = null; - - /* - * whether GATK report tables should have rows in sorted order, starting from leftmost column - */ - @Argument(fullName = "sort_by_all_columns", shortName = "sortAllCols", doc = "Sort the rows in the tables of reports", required = false) - public Boolean SORT_BY_ALL_COLUMNS = false; - - ///////////////////////////// - // Debugging-only Arguments - ///////////////////////////// - - @Hidden - @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") - public String DEFAULT_PLATFORM = null; - - @Hidden - @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") - public String FORCE_PLATFORM = null; - - @Hidden - @Argument(fullName = "force_readgroup", shortName = "fRG", required = false, doc = "If provided, the read group of EVERY read will be forced to be the provided String.") - public String FORCE_READGROUP = null; - - @Hidden - @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only", defaultToStdout = false) - public PrintStream RECAL_TABLE_UPDATE_LOG = null; - - /** - * The repeat covariate will use a context of this size to calculate it's covariate value for base insertions and deletions - */ - @Hidden - @Argument(fullName = "max_str_unit_length", shortName = "maxstr", doc = "Max size of the k-mer context to be used for repeat covariates", required = false) - public int MAX_STR_UNIT_LENGTH = 8; - - @Hidden - @Argument(fullName = "max_repeat_length", shortName = "maxrep", doc = "Max number of repetitions to be used for repeat covariates", required = false) - public int MAX_REPEAT_LENGTH = 20; - - - public File existingRecalibrationReport = null; - - public GATKReportTable generateReportTable(final String covariateNames) { - GATKReportTable argumentsTable; - if(SORT_BY_ALL_COLUMNS) { - argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2, GATKReportTable.TableSortingWay.SORT_BY_COLUMN); - } else { - argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2); - } - argumentsTable.addColumn("Argument"); - argumentsTable.addColumn(RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); - argumentsTable.addRowID("covariate", true); - argumentsTable.set("covariate", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, covariateNames); - argumentsTable.addRowID("no_standard_covs", true); - argumentsTable.set("no_standard_covs", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DO_NOT_USE_STANDARD_COVARIATES); - argumentsTable.addRowID("run_without_dbsnp", true); - argumentsTable.set("run_without_dbsnp", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RUN_WITHOUT_DBSNP); - argumentsTable.addRowID("solid_recal_mode", true); - argumentsTable.set("solid_recal_mode", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_RECAL_MODE); - argumentsTable.addRowID("solid_nocall_strategy", true); - argumentsTable.set("solid_nocall_strategy", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY); - argumentsTable.addRowID("mismatches_context_size", true); - argumentsTable.set("mismatches_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE); - argumentsTable.addRowID("indels_context_size", true); - argumentsTable.set("indels_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INDELS_CONTEXT_SIZE); - argumentsTable.addRowID("mismatches_default_quality", true); - argumentsTable.set("mismatches_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY); - argumentsTable.addRowID("deletions_default_quality", true); - argumentsTable.set("deletions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DELETIONS_DEFAULT_QUALITY); - argumentsTable.addRowID("insertions_default_quality", true); - argumentsTable.set("insertions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_DEFAULT_QUALITY); - argumentsTable.addRowID("maximum_cycle_value", true); - argumentsTable.set("maximum_cycle_value", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MAXIMUM_CYCLE_VALUE); - argumentsTable.addRowID("low_quality_tail", true); - argumentsTable.set("low_quality_tail", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, LOW_QUAL_TAIL); - argumentsTable.addRowID("default_platform", true); - argumentsTable.set("default_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM); - argumentsTable.addRowID("force_platform", true); - argumentsTable.set("force_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM); - argumentsTable.addRowID("quantizing_levels", true); - argumentsTable.set("quantizing_levels", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS); - argumentsTable.addRowID("recalibration_report", true); - argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, existingRecalibrationReport == null ? "null" : existingRecalibrationReport.getAbsolutePath()); - argumentsTable.addRowID("binary_tag_name", true); - argumentsTable.set("binary_tag_name", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME); - return argumentsTable; - } - - /** - * Returns a map with the arguments that differ between this an - * another {@link RecalibrationArgumentCollection} instance. - *

        - * The key is the name of that argument in the report file. The value is a message - * that explains the difference to the end user. - *

        - * Thus, a empty map indicates that there is no differences between both argument collection that - * is relevant to report comparison. - *

        - * This method should not throw any exception. - * - * @param other the argument-collection to compare against. - * @param thisRole the name used to refer to this RAC report that makes sense to the end user. - * @param otherRole the name used to refer to the other RAC report that makes sense to the end user. - * - * @return never null, but a zero-size collection if there are no differences. - */ - @Requires("other != null && thisRole != null && otherRole != null && !thisRole.equalsIgnoreCase(otherRole)") - Map compareReportArguments(final RecalibrationArgumentCollection other,final String thisRole, final String otherRole) { - final Map result = new LinkedHashMap<>(15); - compareRequestedCovariates(result, other, thisRole, otherRole); - compareSimpleReportArgument(result,"no_standard_covs", DO_NOT_USE_STANDARD_COVARIATES, other.DO_NOT_USE_STANDARD_COVARIATES, thisRole, otherRole); - compareSimpleReportArgument(result,"run_without_dbsnp",RUN_WITHOUT_DBSNP,other.RUN_WITHOUT_DBSNP,thisRole,otherRole); - compareSimpleReportArgument(result,"solid_recal_mode", SOLID_RECAL_MODE, other.SOLID_RECAL_MODE,thisRole,otherRole); - compareSimpleReportArgument(result,"solid_nocall_strategy", SOLID_NOCALL_STRATEGY, other.SOLID_NOCALL_STRATEGY,thisRole,otherRole); - compareSimpleReportArgument(result,"mismatches_context_size", MISMATCHES_CONTEXT_SIZE,other.MISMATCHES_CONTEXT_SIZE,thisRole,otherRole); - compareSimpleReportArgument(result,"mismatches_default_quality", MISMATCHES_DEFAULT_QUALITY, other.MISMATCHES_DEFAULT_QUALITY,thisRole,otherRole); - compareSimpleReportArgument(result,"deletions_default_quality", DELETIONS_DEFAULT_QUALITY, other.DELETIONS_DEFAULT_QUALITY,thisRole,otherRole); - compareSimpleReportArgument(result,"insertions_default_quality", INSERTIONS_DEFAULT_QUALITY, other.INSERTIONS_DEFAULT_QUALITY,thisRole,otherRole); - compareSimpleReportArgument(result,"maximum_cycle_value", MAXIMUM_CYCLE_VALUE, other.MAXIMUM_CYCLE_VALUE,thisRole,otherRole); - compareSimpleReportArgument(result,"low_quality_tail", LOW_QUAL_TAIL, other.LOW_QUAL_TAIL,thisRole,otherRole); - compareSimpleReportArgument(result,"default_platform", DEFAULT_PLATFORM, other.DEFAULT_PLATFORM,thisRole,otherRole); - compareSimpleReportArgument(result,"force_platform", FORCE_PLATFORM, other.FORCE_PLATFORM,thisRole,otherRole); - compareSimpleReportArgument(result,"quantizing_levels", QUANTIZING_LEVELS, other.QUANTIZING_LEVELS,thisRole,otherRole); - compareSimpleReportArgument(result,"binary_tag_name", BINARY_TAG_NAME, other.BINARY_TAG_NAME,thisRole,otherRole); - return result; - } - - - /** - * Compares the covariate report lists. - * - * @param diffs map where to annotate the difference. - * @param other the argument collection to compare against. - * @param thisRole the name for this argument collection that makes sense to the user. - * @param otherRole the name for the other argument collection that makes sense to the end user. - * - * @return true if a difference was found. - */ - @Requires("diffs != null && other != null && thisRole != null && otherRole != null") - private boolean compareRequestedCovariates(final Map diffs, - final RecalibrationArgumentCollection other, final String thisRole, final String otherRole) { - - final Set beforeNames = new HashSet<>(this.COVARIATES.length); - final Set afterNames = new HashSet<>(other.COVARIATES.length); - Utils.addAll(beforeNames, this.COVARIATES); - Utils.addAll(afterNames,other.COVARIATES); - final Set intersect = new HashSet<>(Math.min(beforeNames.size(),afterNames.size())); - intersect.addAll(beforeNames); - intersect.retainAll(afterNames); - - String diffMessage = null; - if (intersect.size() == 0) { // In practice this is not possible due to required covariates but... - diffMessage = String.format("There are no common covariates between '%s' and '%s'" - + " recalibrator reports. Covariates in '%s': {%s}. Covariates in '%s': {%s}.",thisRole,otherRole, - thisRole,Utils.join(", ",this.COVARIATES), - otherRole,Utils.join(",",other.COVARIATES)); - } else if (intersect.size() != beforeNames.size() || intersect.size() != afterNames.size()) { - beforeNames.removeAll(intersect); - afterNames.removeAll(intersect); - diffMessage = String.format("There are differences in the set of covariates requested in the" - + " '%s' and '%s' recalibrator reports. " - + " Exclusive to '%s': {%s}. Exclusive to '%s': {%s}.",thisRole,otherRole, - thisRole,Utils.join(", ",beforeNames), - otherRole,Utils.join(", ",afterNames)); - } - if (diffMessage != null) { - diffs.put("covariate",diffMessage); - return true; - } else { - return false; - } - } - - /** - * Annotates a map with any difference encountered in a simple value report argument that differs between this an - * another {@link RecalibrationArgumentCollection} instance. - *

        - * The key of the new entry would be the name of that argument in the report file. The value is a message - * that explains the difference to the end user. - *

        - * - *

        - * This method should not return any exception. - * - * @param diffs where to annotate the differences. - * @param name the name of the report argument to compare. - * @param thisValue this argument collection value for that argument. - * @param otherValue the other collection value for that argument. - * @param thisRole the name used to refer to this RAC report that makes sense to the end user. - * @param otherRole the name used to refer to the other RAC report that makes sense to the end user. - * - * @type T the argument Object value type. - * - * @return true if a difference has been spotted, thus diff has been modified. - */ - private boolean compareSimpleReportArgument(final Map diffs, - final String name, final T thisValue, final T otherValue, final String thisRole, final String otherRole) { - if (thisValue == null && otherValue == null) { - return false; - } else if (thisValue != null && thisValue.equals(otherValue)) { - return false; - } else { - diffs.put(name, - String.format("differences between '%s' {%s} and '%s' {%s}.", - thisRole,thisValue == null ? "" : thisValue, - otherRole,otherValue == null ? "" : otherValue)); - return true; - } - - } - - /** - * Create a shallow copy of this argument collection. - * - * @return never null. - */ - @Override - public RecalibrationArgumentCollection clone() { - try { - return (RecalibrationArgumentCollection) super.clone(); - } catch (CloneNotSupportedException e) { - throw new GATKException("Unreachable code clone not supported thrown when the class " - + this.getClass().getName() + " is cloneable ",e); - } - } - -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/RecalibrationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/RecalibrationEngine.java index 52a34aa54..aa20c9656 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/RecalibrationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/RecalibrationEngine.java @@ -52,9 +52,13 @@ package org.broadinstitute.gatk.tools.walkers.bqsr; import com.google.java.contract.Requires; +import org.broadinstitute.gatk.engine.recalibration.ReadCovariates; +import org.broadinstitute.gatk.engine.recalibration.RecalDatum; +import org.broadinstitute.gatk.engine.recalibration.RecalUtils; +import org.broadinstitute.gatk.engine.recalibration.RecalibrationTables; import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; import org.broadinstitute.gatk.utils.recalibration.*; -import org.broadinstitute.gatk.utils.recalibration.covariates.Covariate; +import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import java.io.PrintStream; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/BaseCoverageDistribution.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/BaseCoverageDistribution.java index e4735505b..81c080d17 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/BaseCoverageDistribution.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/BaseCoverageDistribution.java @@ -54,10 +54,10 @@ package org.broadinstitute.gatk.tools.walkers.diagnostics; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.report.GATKReport; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.report.GATKReport; import org.broadinstitute.gatk.engine.walkers.LocusWalker; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; @@ -71,17 +71,16 @@ import java.util.LinkedList; import java.util.Map; /** - * Simple walker to plot the coverage distribution per base + * Evaluate coverage distribution per base * *

        - * Features of this walker: - *

      • includes a smart counting of uncovered bases without visiting the uncovered loci
      • - *
      • includes reads with deletions in the loci (optionally can be turned off)
      • + * This tool reports the distribution of coverage per base. It includes reads with deletions in the counts unless + * otherwise specified. Quality filters can be applied before the coverage is calculated. *

        * *

        Input

        *

        - * The BAM file and an optional interval list (works for WGS as well) + * The BAM file and an optional interval list *

        * *

        Output

        @@ -89,13 +88,13 @@ import java.util.Map; * A GATK Report with the coverage distribution per base * *

        - *

        Examples

        + *

        Usage example

        *
        - * java -Xmx4g -jar GenomeAnalysisTK.jar \
        - *   -R ref.fasta \
        + * java -jar GenomeAnalysisTK.jar \
        + *   -R reference.fasta \
          *   -T BaseCoverageDistribution \
          *   -I myData.bam \
        - *   -L interesting.intervals \
        + *   -L intervals.list \
          *   -fd \
          *   -o report.grp
          * 
        @@ -106,34 +105,34 @@ import java.util.Map; @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) public class BaseCoverageDistribution extends LocusWalker, Map>> { /** - * The output GATK Report table + * The name of the file to output the GATK Report table. See the FAQs for more information on the GATK Report format. */ - @Output(doc = "The output GATK Report table") + @Output(doc = "Output filename") private PrintStream out; /** * Whether or not a deletion should be counted towards the coverage of a site */ - @Argument(required = false, shortName="del", fullName = "include_deletions", doc ="whether or not to include reads with deletions on the loci in the pileup") + @Argument(required = false, shortName="del", fullName = "include_deletions", doc ="Include reads with deletions") private boolean includeDeletions = true; /** - * Whether or not to calculate and output a filtered coverage distribution. Bases will be filtered according to the + * Whether or not to apply quality filters before calculating coverage distribution. Filtering will use the * minimum_mapping_quality and minimum_base_quality parameters below. */ - @Argument(required = false, shortName="fd", fullName = "filtered_distribution", doc ="calculate and report the filtered coverage distribution of bases") + @Argument(required = false, shortName="fd", fullName = "filtered_distribution", doc ="Apply quality filters") private boolean calculateFilteredDistribution = false; /** * The minimum mapping quality a read must have to be counted towards the filtered coverage of a site */ - @Argument(required = false, shortName="mmq", fullName = "minimum_mapping_quality", doc ="minimum mapping quality of a read to include it in the filtered coverage distribution") + @Argument(required = false, shortName="mmq", fullName = "minimum_mapping_quality", doc ="Minimum read mapping quality of a read to pass filters") private byte minMappingQuality = 20; /** * The minimum base quality a base must have to be counted towards the filtered coverage of a site */ - @Argument(required = false, shortName="mbq", fullName = "minimum_base_quality", doc ="minimum base quality of a base to include it in the filtered coverage distribution") + @Argument(required = false, shortName="mbq", fullName = "minimum_base_quality", doc ="Minimum base quality to pass filters") private byte minBaseQuality = 17; private GenomeLoc previousLocus = null; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/FindCoveredIntervals.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/FindCoveredIntervals.java index 18fe381a3..6f1718430 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/FindCoveredIntervals.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/FindCoveredIntervals.java @@ -54,9 +54,9 @@ package org.broadinstitute.gatk.tools.walkers.diagnostics; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.ActiveRegionTraversalParameters; import org.broadinstitute.gatk.engine.walkers.ActiveRegionWalker; import org.broadinstitute.gatk.engine.walkers.PartitionBy; @@ -69,9 +69,10 @@ import org.broadinstitute.gatk.utils.help.HelpConstants; import java.io.PrintStream; /** - * Outputs a list of intervals that are covered above a given threshold. + * Outputs a list of intervals that are covered above a given threshold * - *

        The list can be used as an interval list for other walkers. Note that if the -uncovered argument is given, the tool will instead output intervals that fail the coverage threshold.

        + *

        The output list can be used as an interval list for other tools. Note that if the -uncovered argument is given, the + * logic will be inverted and the tool will instead output intervals that fail the coverage threshold.

        * *

        Input

        *

        @@ -85,16 +86,16 @@ import java.io.PrintStream; * *

        Example

        *
        - * java -Xmx2g -jar GenomeAnalysisTK.jar \
        + * java -jar GenomeAnalysisTK.jar \
          *   -T FindCoveredIntervals \
        - *   -R ref.fasta \
        + *   -R reference.fasta \
          *   -I my_file.bam \
          *   -o output.list
          * 
        * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) -@PartitionBy(PartitionType.CONTIG) +@PartitionBy(value = PartitionType.CONTIG) @ActiveRegionTraversalParameters(extension = 0, maxRegion = 50000) public class FindCoveredIntervals extends ActiveRegionWalker { @Output diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java index 9240c56e3..56097e625 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java @@ -56,12 +56,11 @@ import org.broadinstitute.gatk.engine.walkers.*; import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.SampleUtils; import org.broadinstitute.gatk.utils.classloader.PluginManager; import org.broadinstitute.gatk.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.gatk.utils.exceptions.UserException; @@ -70,49 +69,44 @@ import org.broadinstitute.gatk.utils.help.HelpConstants; import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.*; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.io.PrintStream; import java.util.*; /** - * Analyzes coverage distribution and validates read mates for a given interval and sample. - *

        + * Analyze coverage distribution and validate read mates per interval and per sample + * *

        - * Used to diagnose regions with bad coverage, mapping, or read mating. Analyzes each sample independently in addition - * to interval wide analysis. + * This tool is useful for diagnosing regions with bad coverage, mapping, or read mate pairs. It analyzes each sample + * independently and aggregates results over intervals of interest. *

        - *

        - *

        *

        Input

        - *

        *

          *
        • A reference file
        • *
        • one or more input BAMs
        • *
        • One or more intervals
        • *
        - *

        - *

        *

        Output

        *

        * A modified VCF detailing each interval by sample and information for each interval according to the thresholds used. - * Interval information includes GC Content, average interval depth, callable status among others. - * - * If you use the --missing option, you can get as a second output a intervals file with the loci that have missing data. + * Interval information includes GC Content, average interval depth, callable status among others. If you use the + * --missing option, you can get as a second output a intervals file with the loci that have missing data. * This file can then be used as input to QualifyMissingIntervals for full qualification and interpretation of why * the data is missing. *

        - *

        - *

        Examples

        + *

        Usage example

        *
        - *    java
        - *      -jar GenomeAnalysisTK.jar
        + *    java -jar GenomeAnalysisTK.jar
          *              -T DiagnoseTargets \
          *              -R reference.fasta \
        - *              -o output.vcf \
          *              -I sample1.bam \
          *              -I sample2.bam \
          *              -I sample3.bam \
        - *              -L intervals.interval_list
        + *              -L intervals.interval_list \
        + *              -o output.vcf
          *  
        * * @author Mauricio Carneiro, Roger Zurawicki @@ -124,12 +118,6 @@ import java.util.*; @Downsample(by = DownsampleType.NONE) public class DiagnoseTargets extends LocusWalker { - private static final String AVG_INTERVAL_DP_KEY = "IDP"; - private static final String LOW_COVERAGE_LOCI = "LL"; - private static final String ZERO_COVERAGE_LOCI = "ZL"; - private static final String GC_CONTENT_KEY = "GC"; - - @Output(doc = "File to which interval statistics should be written") private VariantContextWriter vcfWriter = null; @@ -150,11 +138,11 @@ public class DiagnoseTargets extends LocusWalker { if (getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty()) throw new UserException("This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead."); - intervalMap = new LinkedHashMap(INITIAL_HASH_SIZE); - intervalListIterator = new PeekableIterator(getToolkit().getIntervals().iterator()); + intervalMap = new LinkedHashMap<>(INITIAL_HASH_SIZE); + intervalListIterator = new PeekableIterator<>(getToolkit().getIntervals().iterator()); // get all of the unique sample names for the VCF Header - samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + samples = ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); // pre load all the statistics classes because it is costly to operate on the JVM and we only want to do it once. @@ -224,7 +212,7 @@ public class DiagnoseTargets extends LocusWalker { */ private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) { // output any intervals that were finished - final List toRemove = new LinkedList(); + final List toRemove = new LinkedList<>(); for (GenomeLoc key : intervalMap.keySet()) { if (key.isBefore(refLocus)) { final IntervalStratification intervalStats = intervalMap.get(key); @@ -263,17 +251,17 @@ public class DiagnoseTargets extends LocusWalker { private void outputStatsToVCF(final IntervalStratification stats, final Allele refAllele) { GenomeLoc interval = stats.getInterval(); - final List alleles = new ArrayList(); - final Map attributes = new HashMap(); - final ArrayList genotypes = new ArrayList(); + final List alleles = new ArrayList<>(); + final Map attributes = new HashMap<>(); + final ArrayList genotypes = new ArrayList<>(); for (String sample : samples) { final GenotypeBuilder gb = new GenotypeBuilder(sample); SampleStratification sampleStat = stats.getSampleStatistics(sample); - gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage(interval.size())); - gb.attribute(LOW_COVERAGE_LOCI, sampleStat.getNLowCoveredLoci()); - gb.attribute(ZERO_COVERAGE_LOCI, sampleStat.getNUncoveredLoci()); + gb.attribute(GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY, sampleStat.averageCoverage(interval.size())); + gb.attribute(GATKVCFConstants.LOW_COVERAGE_LOCI, sampleStat.getNLowCoveredLoci()); + gb.attribute(GATKVCFConstants.ZERO_COVERAGE_LOCI, sampleStat.getNUncoveredLoci()); gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false)); genotypes.add(gb.make()); @@ -283,11 +271,11 @@ public class DiagnoseTargets extends LocusWalker { VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles); vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); - vcb.filters(new LinkedHashSet(statusToStrings(stats.callableStatuses(), true))); + vcb.filters(new LinkedHashSet<>(statusToStrings(stats.callableStatuses(), true))); attributes.put(VCFConstants.END_KEY, interval.getStop()); - attributes.put(AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size())); - attributes.put(GC_CONTENT_KEY, stats.gcContent()); + attributes.put(GATKVCFConstants.AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size())); + attributes.put(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY, stats.gcContent()); vcb = vcb.attributes(attributes); vcb = vcb.genotypes(genotypes); @@ -347,7 +335,7 @@ public class DiagnoseTargets extends LocusWalker { * @return a matching set of strings */ private List statusToStrings(Iterable statuses, final boolean isInfoField) { - List output = new LinkedList(); + List output = new LinkedList<>(); for (CallableStatus status : statuses) if ( isInfoField || status != CallableStatus.PASS ) @@ -398,19 +386,19 @@ public class DiagnoseTargets extends LocusWalker { * @return A set of VCF header lines */ private static Set getHeaderInfo() { - Set headerLines = new HashSet(); + Set headerLines = new HashSet<>(); // INFO fields for overall data headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); - headerLines.add(new VCFInfoHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size.")); - headerLines.add(new VCFInfoHeaderLine(GC_CONTENT_KEY, 1, VCFHeaderLineType.Float, "GC Content of the interval")); + headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AVG_INTERVAL_DP_KEY)); + headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.INTERVAL_GC_CONTENT_KEY)); headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); // FORMAT fields for each genotype headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY)); - headerLines.add(new VCFFormatHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average sample depth across the interval. Sum of the sample specific depth in all loci divided by interval size.")); - headerLines.add(new VCFFormatHeaderLine(LOW_COVERAGE_LOCI, 1, VCFHeaderLineType.Integer, "Number of loci for this sample, in this interval with low coverage (below the minimum coverage) but not zero.")); - headerLines.add(new VCFFormatHeaderLine(ZERO_COVERAGE_LOCI, 1, VCFHeaderLineType.Integer, "Number of loci for this sample, in this interval with zero coverage.")); + headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.AVG_INTERVAL_DP_BY_SAMPLE_KEY)); + headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.LOW_COVERAGE_LOCI)); + headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.ZERO_COVERAGE_LOCI)); // FILTER fields for (CallableStatus stat : CallableStatus.values()) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/IntervalStratification.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/IntervalStratification.java index 715ee5b05..4261eee4c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/IntervalStratification.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/IntervalStratification.java @@ -51,8 +51,8 @@ package org.broadinstitute.gatk.tools.walkers.diagnostics.diagnosetargets; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/missing/QualifyMissingIntervals.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/missing/QualifyMissingIntervals.java index ef83c71e3..36cf28696 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/missing/QualifyMissingIntervals.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/missing/QualifyMissingIntervals.java @@ -56,11 +56,11 @@ import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Gather; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportGatherer; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportGatherer; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.GenomeLocSortedSet; @@ -74,9 +74,9 @@ import java.io.PrintStream; import java.util.List; /** - * Walks along reference and calculates a few metrics for each interval. + * Collect quality metrics for a set of intervals * - * Metrics: + *

        This tool collects the following metrics:

        *
          *
        • Average Base Quality
        • *
        • Average Mapping Quality
        • @@ -88,9 +88,11 @@ import java.util.List; *
        • Length of the uncovered interval
        • *
        * + *

        It is meant to be run on a set of intervals that have been identified as problematic in earlier stages of quality control and are considered "missing" from the sequence dataset.

        + * *

        Input

        *

        - * A reference file (for GC content), the input bam file (for base and mapping quality calculation), the missing intervals (in the -L), the baits/targets used to sequence (in the -targets) and a bed file with the coding sequence intervals of the genome (in the -cds) + * A reference file (for GC content), the input bam file (for base and mapping quality calculation), the missing intervals (in the -L), the baits/targets used to sequence (in the -targets) and a bed file with the coding sequence intervals of the genome (in the -cds). *

        * *

        Output

        @@ -98,11 +100,11 @@ import java.util.List; * GC content, distance from the end of the target, coding sequence intersection, mapping and base quality averages and average depth per "missing" interval. *

        * - *

        Example

        + *

        Usage example

        *
        - * java -Xmx2g -jar GenomeAnalysisTK.jar \
        + * java -jar GenomeAnalysisTK.jar \
          *   -T QualifyMissingIntervals \
        - *   -R ref.fasta \
        + *   -R reference.fasta \
          *   -I input.bam \
          *   -o output.grp \
          *   -L input.intervals \
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleLikelihoodMatrixMapper.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleLikelihoodMatrixMapper.java
        index 4993d5614..54ea7da0a 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleLikelihoodMatrixMapper.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleLikelihoodMatrixMapper.java
        @@ -52,6 +52,7 @@
         package org.broadinstitute.gatk.tools.walkers.genotyper;
         
         import htsjdk.variant.variantcontext.Allele;
        +import org.broadinstitute.gatk.utils.genotyper.AlleleListPermutation;
         import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
         import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
         
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/ConsensusAlleleCounter.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/ConsensusAlleleCounter.java
        index 53523472a..96f432dc1 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/ConsensusAlleleCounter.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/ConsensusAlleleCounter.java
        @@ -52,11 +52,10 @@
         package org.broadinstitute.gatk.tools.walkers.genotyper;
         
         import org.apache.log4j.Logger;
        -import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
        -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
        -import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
        +import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
        +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
        +import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
         import org.broadinstitute.gatk.utils.GenomeLoc;
        -import org.broadinstitute.gatk.utils.GenomeLocParser;
         import org.broadinstitute.gatk.utils.clipping.ReadClipper;
         import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
         import org.broadinstitute.gatk.utils.collections.Pair;
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/ErrorModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/ErrorModel.java
        index 60799caca..71587fe3e 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/ErrorModel.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/ErrorModel.java
        @@ -52,7 +52,7 @@
         package org.broadinstitute.gatk.tools.walkers.genotyper;
         
         import com.google.java.contract.Requires;
        -import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
        +import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
         import org.broadinstitute.gatk.tools.walkers.indels.PairHMMIndelErrorModel;
         import org.broadinstitute.gatk.utils.haplotype.Haplotype;
         import org.broadinstitute.gatk.utils.MathUtils;
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java
        index 349734b07..984215df3 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java
        @@ -54,7 +54,6 @@ package org.broadinstitute.gatk.tools.walkers.genotyper;
         import htsjdk.samtools.SAMUtils;
         import htsjdk.variant.variantcontext.Allele;
         import htsjdk.variant.variantcontext.GenotypeLikelihoods;
        -import htsjdk.variant.vcf.VCFConstants;
         import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.ExactACcounts;
         import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.ExactACset;
         import org.broadinstitute.gatk.utils.MathUtils;
        @@ -62,6 +61,7 @@ import org.broadinstitute.gatk.utils.collections.Pair;
         import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
         import org.broadinstitute.gatk.utils.exceptions.UserException;
         import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
        +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
         
         import java.util.*;
         
        @@ -319,7 +319,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
                     iterator.next();
                 }
                 if (VERBOSE) {
        -            System.out.println(VCFConstants.MLE_ALLELE_COUNT_KEY + ": " + Arrays.toString(mlInd));
        +            System.out.println(GATKVCFConstants.MLE_ALLELE_COUNT_KEY + ": " + Arrays.toString(mlInd));
                 }
                 return new Pair(mlInd,maxVal);
             }
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java
        index 03b3e3374..49e49d82d 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java
        @@ -52,19 +52,19 @@
         package org.broadinstitute.gatk.tools.walkers.genotyper;
         
         import org.apache.log4j.Logger;
        -import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
        -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
        -import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
        -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
        +import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
        +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
        +import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
        +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
         import org.broadinstitute.gatk.utils.GenomeLoc;
         import org.broadinstitute.gatk.utils.GenomeLocParser;
         import org.broadinstitute.gatk.utils.MathUtils;
         import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
        -import htsjdk.variant.vcf.VCFConstants;
         import org.broadinstitute.gatk.utils.collections.Pair;
         import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
         import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
         import htsjdk.variant.variantcontext.*;
        +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
         import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
         
         import java.util.*;
        @@ -287,7 +287,7 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
                 final HashMap attributes = new HashMap();
         
                 if (UAC.referenceSampleName != null && perLaneErrorModels != null)
        -            attributes.put(VCFConstants.REFSAMPLE_DEPTH_KEY, ErrorModel.getTotalReferenceDepth(perLaneErrorModels));
        +            attributes.put(GATKVCFConstants.REFSAMPLE_DEPTH_KEY, ErrorModel.getTotalReferenceDepth(perLaneErrorModels));
         
                 builder.attributes(attributes);
                 // create the genotypes; no-call everyone for now
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java
        index 0964a74ab..3a65a3a9e 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java
        @@ -51,7 +51,7 @@
         
         package org.broadinstitute.gatk.tools.walkers.genotyper;
         
        -import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
        +import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
         import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.ExactACset;
         import org.broadinstitute.gatk.tools.walkers.indels.PairHMMIndelErrorModel;
         import org.broadinstitute.gatk.utils.haplotype.Haplotype;
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java
        index 693982b2f..c0e2ea95e 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java
        @@ -52,10 +52,10 @@
         package org.broadinstitute.gatk.tools.walkers.genotyper;
         
         import org.apache.log4j.Logger;
        -import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
        -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
        -import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
        -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
        +import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
        +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
        +import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
        +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
         import org.broadinstitute.gatk.tools.walkers.indels.PairHMMIndelErrorModel;
         import org.broadinstitute.gatk.utils.*;
         import org.broadinstitute.gatk.utils.haplotype.Haplotype;
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java
        index ecb66cdf9..93b7524db 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java
        @@ -77,10 +77,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper;
         
         
         import org.apache.log4j.Logger;
        -import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
        -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
        -import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
        -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
        +import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
        +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
        +import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
        +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
         import org.broadinstitute.gatk.utils.*;
         import org.broadinstitute.gatk.utils.BaseUtils;
         import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils;
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
        index c3208ce8b..b32f291f9 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
        @@ -52,10 +52,10 @@
         package org.broadinstitute.gatk.tools.walkers.genotyper;
         
         import org.apache.log4j.Logger;
        -import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
        -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
        -import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
        -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
        +import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
        +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
        +import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
        +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
         import org.broadinstitute.gatk.utils.BaseUtils;
         import org.broadinstitute.gatk.utils.GenomeLocParser;
         import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingData.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingData.java
        index 1f73c7140..c73690a84 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingData.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingData.java
        @@ -52,7 +52,10 @@
         package org.broadinstitute.gatk.tools.walkers.genotyper;
         
         import htsjdk.variant.variantcontext.Allele;
        +import org.broadinstitute.gatk.utils.genotyper.AlleleList;
         import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
        +import org.broadinstitute.gatk.utils.genotyper.SampleList;
        +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils;
         
         /**
          * Encapsulates the data use to make the genotype calls.
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java
        index 60254fdc8..8dd616f41 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java
        @@ -54,15 +54,13 @@ package org.broadinstitute.gatk.tools.walkers.genotyper;
         import com.google.java.contract.Ensures;
         import com.google.java.contract.Requires;
         import htsjdk.variant.variantcontext.*;
        -import htsjdk.variant.vcf.VCFConstants;
        -import htsjdk.variant.vcf.VCFHeaderLineType;
         import htsjdk.variant.vcf.VCFInfoHeaderLine;
         import org.apache.log4j.Logger;
        -import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection;
        -import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
        -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
        -import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
        -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
        +import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
        +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
        +import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
        +import org.broadinstitute.gatk.utils.genotyper.SampleList;
        +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
         import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine;
         import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator;
         import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculationResult;
        @@ -74,6 +72,8 @@ import org.broadinstitute.gatk.utils.QualityUtils;
         import org.broadinstitute.gatk.utils.exceptions.UserException;
         import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils;
         import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
        +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
        +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
         import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
         
         import java.util.*;
        @@ -85,10 +85,6 @@ import java.util.*;
          */
         public abstract class GenotypingEngine {
         
        -    public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA";
        -
        -    public static final String LOW_QUAL_FILTER_NAME = "LowQual";
        -
             protected final AFCalculatorProvider afCalculatorProvider   ;
         
             protected Logger logger;
        @@ -157,7 +153,7 @@ public abstract class GenotypingEngine getAppropriateVCFInfoHeaders() {
                 Set headerInfo = new HashSet<>();
                 if ( configuration.genotypeArgs.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED )
        -            headerInfo.add(new VCFInfoHeaderLine(UnifiedGenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, 1, VCFHeaderLineType.Integer, "Number of alternate alleles discovered (but not necessarily genotyped) at this site"));
        +            headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.NUMBER_OF_DISCOVERED_ALLELES_KEY));
                 return headerInfo;
             }
         
        @@ -261,7 +257,7 @@ public abstract class GenotypingEngine inputPriors) {
         
        -        final double[] priors = new double[N + 1];
        -        double sum = 0.0;
        -        final AFPriorProvider result;
        -
                 if (!inputPriors.isEmpty()) {
                     // user-specified priors
                     if (inputPriors.size() != N)
        @@ -651,17 +643,17 @@ public abstract class GenotypingEngine 0 ) {
        -            attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE);
        +            attributes.put(GATKVCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE);
                     final ArrayList MLEfrequencies = calculateMLEAlleleFrequencies(alleleCountsofMLE, genotypes);
        -            attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies);
        +            attributes.put(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies);
                 }
         
                 if ( configuration.genotypeArgs.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED )
        -            attributes.put(NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size());
        +            attributes.put(GATKVCFConstants.NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size());
         
         
                 return attributes;
        @@ -673,7 +665,7 @@ public abstract class GenotypingEngine MLEfrequencies = new ArrayList(alleleCountsofMLE.size());
        +        final ArrayList MLEfrequencies = new ArrayList<>(alleleCountsofMLE.size());
                 // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT is ./. but the exact model may arbitrarily choose an AC>1)
                 for (final int AC : alleleCountsofMLE )
                     MLEfrequencies.add(Math.min(1.0, (double)AC / (double)AN));
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingLikelihoods.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingLikelihoods.java
        index 873318532..f06a40b73 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingLikelihoods.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingLikelihoods.java
        @@ -53,6 +53,8 @@ package org.broadinstitute.gatk.tools.walkers.genotyper;
         
         import htsjdk.variant.variantcontext.Allele;
         import htsjdk.variant.variantcontext.GenotypeLikelihoods;
        +import org.broadinstitute.gatk.utils.genotyper.AlleleList;
        +import org.broadinstitute.gatk.utils.genotyper.SampleList;
         
         import java.util.List;
         
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingModel.java
        index f2413f122..4dfb8d312 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingModel.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingModel.java
        @@ -52,6 +52,7 @@
         package org.broadinstitute.gatk.tools.walkers.genotyper;
         
         import htsjdk.variant.variantcontext.Allele;
        +import org.broadinstitute.gatk.utils.genotyper.AlleleList;
         
         /**
          * Common interface for genotyping models.
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/HomogeneousPloidyModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/HomogeneousPloidyModel.java
        index bc7691e46..dee370eec 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/HomogeneousPloidyModel.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/HomogeneousPloidyModel.java
        @@ -51,6 +51,8 @@
         
         package org.broadinstitute.gatk.tools.walkers.genotyper;
         
        +import org.broadinstitute.gatk.utils.genotyper.SampleList;
        +
         /**
         * {@link PloidyModel} implementation tailored to work with a homogeneous constant ploidy
         * across samples and positions.
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
        index 10ffeffd2..7ee4a9aca 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
        @@ -52,10 +52,10 @@
         package org.broadinstitute.gatk.tools.walkers.genotyper;
         
         import org.apache.log4j.Logger;
        -import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
        -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
        -import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
        -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
        +import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
        +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
        +import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
        +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
         import org.broadinstitute.gatk.tools.walkers.indels.PairHMMIndelErrorModel;
         import org.broadinstitute.gatk.utils.BaseUtils;
         import org.broadinstitute.gatk.utils.GenomeLoc;
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/InfiniteRandomMatingPopulationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/InfiniteRandomMatingPopulationModel.java
        index 320ccebe7..1589e8374 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/InfiniteRandomMatingPopulationModel.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/InfiniteRandomMatingPopulationModel.java
        @@ -53,6 +53,9 @@ package org.broadinstitute.gatk.tools.walkers.genotyper;
         
         import htsjdk.variant.variantcontext.Allele;
         import htsjdk.variant.variantcontext.GenotypeLikelihoods;
        +import org.broadinstitute.gatk.utils.genotyper.AlleleList;
        +import org.broadinstitute.gatk.utils.genotyper.AlleleListPermutation;
        +import org.broadinstitute.gatk.utils.genotyper.AlleleListUtils;
         import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
         
         import java.util.ArrayList;
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/PloidyModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/PloidyModel.java
        index e21494985..1ad1a2241 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/PloidyModel.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/PloidyModel.java
        @@ -51,6 +51,8 @@
         
         package org.broadinstitute.gatk.tools.walkers.genotyper;
         
        +import org.broadinstitute.gatk.utils.genotyper.SampleList;
        +
         /**
          * Information about the number of chromosome per sample at a given location.
          *
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
        index 39a37b642..9f8e88fac 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
        @@ -52,10 +52,10 @@
         package org.broadinstitute.gatk.tools.walkers.genotyper;
         
         import org.apache.log4j.Logger;
        -import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
        -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils;
        -import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
        -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
        +import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
        +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
        +import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
        +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
         import org.broadinstitute.gatk.utils.BaseUtils;
         import org.broadinstitute.gatk.utils.GenomeLoc;
         import org.broadinstitute.gatk.utils.GenomeLocParser;
        @@ -69,6 +69,7 @@ import org.broadinstitute.gatk.utils.pileup.PileupElement;
         import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
         import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl;
         import htsjdk.variant.variantcontext.*;
        +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
         import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
         
         import java.util.*;
        @@ -202,7 +203,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
                     gb.DP(sampleData.depth);
                     gb.alleles(noCall);
                     if (UAC.annotateAllSitesWithPLs)
        -                gb.attribute(UnifiedGenotypingEngine.PL_FOR_ALL_SNP_ALLELES_KEY,GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(allLikelihoods, false, true)));
        +                gb.attribute(GATKVCFConstants.PL_FOR_ALL_SNP_ALLELES_KEY,GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(allLikelihoods, false, true)));
                     genotypes.add(gb.make());
                 }
         
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/StandardCallerArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/StandardCallerArgumentCollection.java
        new file mode 100644
        index 000000000..b2bd306fc
        --- /dev/null
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/StandardCallerArgumentCollection.java
        @@ -0,0 +1,231 @@
        +/*
        +* By downloading the PROGRAM you agree to the following terms of use:
        +* 
        +* BROAD INSTITUTE
        +* SOFTWARE LICENSE AGREEMENT
        +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
        +* 
        +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
        +* 
        +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
        +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
        +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
        +* 
        +* 1. DEFINITIONS
        +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
        +* 
        +* 2. LICENSE
        +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute.  LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
        +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
        +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
        +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
        +* 
        +* 3. PHONE-HOME FEATURE
        +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM.  Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time.  Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
        +* 
        +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
        +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
        +* Copyright 2012-2014 Broad Institute, Inc.
        +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
        +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
        +* 
        +* 5. INDEMNIFICATION
        +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
        +* 
        +* 6. NO REPRESENTATIONS OR WARRANTIES
        +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
        +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
        +* 
        +* 7. ASSIGNMENT
        +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
        +* 
        +* 8. MISCELLANEOUS
        +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
        +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
        +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
        +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
        +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
        +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
        +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
        +*/
        +
        +package org.broadinstitute.gatk.tools.walkers.genotyper;
        +
        +import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection;
        +import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorImplementation;
        +import org.broadinstitute.gatk.utils.commandline.*;
        +import org.broadinstitute.gatk.utils.collections.DefaultHashMap;
        +import htsjdk.variant.variantcontext.VariantContext;
        +
        +import java.io.File;
        +import java.lang.reflect.Field;
        +import java.lang.reflect.Method;
        +import java.lang.reflect.Modifier;
        +import java.util.Collections;
        +import java.util.Map;
        +
        +/**
        + * Created with IntelliJ IDEA.
        + * User: rpoplin
        + * Date: 8/20/12
        + * A collection of arguments that are common to the various callers.
        + * This is pulled out so that every caller isn't exposed to the arguments from every other caller.
        + */
        +
        +public class StandardCallerArgumentCollection implements Cloneable {
        +
        +    @ArgumentCollection
        +    public GenotypeCalculationArgumentCollection genotypeArgs = new GenotypeCalculationArgumentCollection();
        +
        +    @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false)
        +    public GenotypingOutputMode genotypingOutputMode = GenotypingOutputMode.DISCOVERY;
        +
        +    /**
        +     * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding
        +     */
        +    @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false)
        +    public RodBinding alleles;
        +
        +    /**
        +     * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
        +     * Basically, it will ignore the contamination fraction of reads for each alternate allele.  So if the pileup contains N total bases, then we
        +     * will try to remove (N * contamination fraction) bases for each alternate allele.
        +     */
        +    @Argument(fullName = "contamination_fraction_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false)
        +    public double CONTAMINATION_FRACTION = DEFAULT_CONTAMINATION_FRACTION;
        +    public static final double DEFAULT_CONTAMINATION_FRACTION = 0.0;
        +
        +    /**
        +     *  This argument specifies a file with two columns "sample" and "contamination" specifying the contamination level for those samples.
        +     *  Samples that do not appear in this file will be processed with CONTAMINATION_FRACTION.
        +     **/
        +    @Advanced
        +    @Argument(fullName = "contamination_fraction_per_sample_file", shortName = "contaminationFile", doc = "Tab-separated File containing fraction of contamination in sequencing data (per sample) to aggressively remove. Format should be \"\" (Contamination is double) per line; No header.", required = false)
        +    public File CONTAMINATION_FRACTION_FILE = null;
        +
        +    /**
        +     * Indicates whether there is some sample contamination present.
        +     */
        +    private boolean sampleContaminationWasLoaded = false;
        +
        +    /**
        +     *
        +     * @return an _Immutable_ copy of the Sample-Contamination Map, defaulting to CONTAMINATION_FRACTION so that if the sample isn't in the map map(sample)==CONTAMINATION_FRACTION
        +     */
        +    public Map getSampleContamination(){
        +        //make sure that the default value is set up right
        +        sampleContamination.setDefaultValue(CONTAMINATION_FRACTION);
        +        if (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0)
        +            sampleContaminationWasLoaded = true;
        +        return Collections.unmodifiableMap(sampleContamination);
        +    }
        +
        +    public void setSampleContamination(DefaultHashMap sampleContamination) {
        +        this.sampleContamination.clear();
        +        this.sampleContaminationWasLoaded = !Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0;
        +        if (!sampleContaminationWasLoaded)
        +            for (final Double d : sampleContamination.values())
        +                if (!Double.isNaN(d) && d > 0.0) {
        +                    sampleContaminationWasLoaded = true;
        +                    break;
        +                }
        +        this.sampleContamination.putAll(sampleContamination);
        +        this.sampleContamination.setDefaultValue(CONTAMINATION_FRACTION);
        +    }
        +
        +    /**
        +     * Returns true if there is some sample contamination present, false otherwise.
        +     * @return {@code true} iff there is some sample contamination
        +     */
        +    public boolean isSampleContaminationPresent() {
        +        return (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0) || sampleContaminationWasLoaded;
        +   }
        +
        +    //Needs to be here because it uses CONTAMINATION_FRACTION
        +    private DefaultHashMap sampleContamination = new DefaultHashMap(CONTAMINATION_FRACTION);
        +
        +    /**
        +     * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
        +     */
        +    @Hidden
        +    @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
        +    public AFCalculatorImplementation requestedAlleleFrequencyCalculationModel;
        +
        +    @Hidden
        +    @Argument(shortName = "logExactCalls", doc="x", required=false)
        +    public File exactCallsLog = null;
        +
        +    @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
        +    public OutputMode outputMode = OutputMode.EMIT_VARIANTS_ONLY;
        +
        +    /**
        +     * Advanced, experimental argument: if SNP likelihood model is specified, and if EMIT_ALL_SITES output mode is set, when we set this argument then we will also emit PLs at all sites.
        +     * This will give a measure of reference confidence and a measure of which alt alleles are more plausible (if any).
        +     * WARNINGS:
        +     * - This feature will inflate VCF file size considerably.
        +     * - All SNP ALT alleles will be emitted with corresponding 10 PL values.
        +     * - An error will be emitted if EMIT_ALL_SITES is not set, or if anything other than diploid SNP model is used
        +     */
        +    @Advanced
        +    @Argument(fullName = "allSitePLs", shortName = "allSitePLs", doc = "Annotate all sites with PLs", required = false)
        +    public boolean annotateAllSitesWithPLs = false;
        +
        +    /**
        +     * Creates a Standard caller argument collection with default values.
        +     */
        +    public StandardCallerArgumentCollection() { }
        +
        +    /**
        +     * "Casts" a caller argument collection into another type.
        +     *
        +     * 

        Common fields values are copied across

        + * @param clazz the class of the result. + * @param result argument collection class. + * @return never {@code null}. + */ + public T cloneTo(final Class clazz) { + // short cut: just use regular clone if it happens to be the same class. + if (clazz == getClass()) + return (T) clone(); + try { + final T result = clazz.newInstance(); + for (final Field field : getClass().getFields()) { + // just copy common fields. + if (!field.getDeclaringClass().isAssignableFrom(clazz)) + continue; + final int fieldModifiers = field.getModifiers(); + if ((fieldModifiers & UNCOPYABLE_MODIFIER_MASK) != 0) continue; + //Use the clone() method if appropriate + if (Cloneable.class.isAssignableFrom(field.getType())) { + Method clone = field.getType().getMethod("clone"); + field.set(result, clone.invoke(field.get(this))); + } else + field.set(result,field.get(this)); + } + return result; + } catch (final Exception ex) { + throw new IllegalStateException(ex); + } + } + + /** + * Creates a copy of this configuration. + * @return never {@code null}. + */ + @Override + public StandardCallerArgumentCollection clone() { + try { + StandardCallerArgumentCollection cloned = (StandardCallerArgumentCollection) super.clone(); + cloned.genotypeArgs = genotypeArgs.clone(); + return cloned; + } catch (CloneNotSupportedException e) { + throw new IllegalStateException("unreachable code"); + } + } + + /** + * Holds a modifiers mask that identifies those fields that cannot be copied between + * StandardCallerArgumentCollections. + */ + private final int UNCOPYABLE_MODIFIER_MASK = Modifier.PRIVATE | Modifier.STATIC | Modifier.FINAL; +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedArgumentCollection.java index cc3ba9353..08e13da1b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedArgumentCollection.java @@ -52,7 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection; +import org.broadinstitute.gatk.tools.walkers.genotyper.StandardCallerArgumentCollection; import org.broadinstitute.gatk.utils.pairhmm.PairHMM; import htsjdk.variant.variantcontext.VariantContext; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyper.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyper.java index 21f445f7e..bfcfd91c2 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyper.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyper.java @@ -58,40 +58,42 @@ import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.downsampling.AlleleBiasedDownsamplingUtils; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.downsampling.AlleleBiasedDownsamplingUtils; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; import org.broadinstitute.gatk.engine.filters.BadMateFilter; import org.broadinstitute.gatk.engine.filters.MappingQualityUnavailableFilter; import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.*; import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.FixedAFCalculatorProvider; -import org.broadinstitute.gatk.utils.SampleUtils; import org.broadinstitute.gatk.utils.baq.BAQ; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import java.io.PrintStream; import java.util.*; /** - * A variant caller which unifies the approaches of several disparate callers -- Works for single-sample and multi-sample data. + * Call SNPs and indels on a per-locus basis * *

        - * The GATK Unified Genotyper is a multiple-sample, technology-aware SNP and indel caller. It uses a Bayesian genotype - * likelihood model to estimate simultaneously the most likely genotypes and allele frequency in a population of N samples, - * emitting an accurate posterior probability of there being a segregating variant allele at each locus as well as for the - * genotype of each sample. The system can either emit just the variant sites or complete genotypes (which includes - * homozygous reference calls) satisfying some phred-scaled confidence value. The genotyper can make accurate calls on - * both single sample data and multi-sample data. + * This tool uses a Bayesian genotype likelihood model to estimate simultaneously the most likely genotypes and + * allele frequency in a population of N samples, emitting a genotype for each sample. The system can either emit + * just the variant sites or complete genotypes (which includes homozygous reference calls) satisfying some + * phred-scaled confidence value. *

        * *

        Input

        @@ -104,54 +106,50 @@ import java.util.*; * A raw, unfiltered, highly sensitive callset in VCF format. *

        * - *

        Example generic command for multi-sample SNP calling

        + *

        Usage examples

        + *

        Multi-sample SNP calling

        *
          * java -jar GenomeAnalysisTK.jar \
        - *   -R resources/Homo_sapiens_assembly18.fasta \
          *   -T UnifiedGenotyper \
        + *   -R reference.fasta \
          *   -I sample1.bam [-I sample2.bam ...] \
          *   --dbsnp dbSNP.vcf \
          *   -o snps.raw.vcf \
          *   -stand_call_conf [50.0] \
          *   -stand_emit_conf 10.0 \
        - *   -dcov [50 for 4x, 200 for >30x WGS or Whole exome] \
          *   [-L targets.interval_list]
          * 
        * - *

        - * The above command will call all of the samples in your provided BAM files [-I arguments] together and produce a VCF file - * with sites and genotypes for all samples. The easiest way to get the dbSNP file is from the GATK resource bundle (see Guide FAQs for details). Several - * arguments have parameters that should be chosen based on the average coverage per sample in your data. See the detailed - * argument descriptions below. - *

        - * - *

        Example command for generating calls at all sites

        + *

        Generate calls at all sites

        *
        - * java -jar /path/to/GenomeAnalysisTK.jar \
        - *   -l INFO \
        - *   -R resources/Homo_sapiens_assembly18.fasta \
        + * java -jar GenomeAnalysisTK.jar \
          *   -T UnifiedGenotyper \
        - *   -I /DCC/ftp/pilot_data/data/NA12878/alignment/NA12878.SLX.maq.SRP000031.2009_08.bam \
        - *   -o my.vcf \
        + *   -R reference.fasta \
        + *   -I input.bam \
        + *   -o raw_variants.vcf \
          *   --output_mode EMIT_ALL_SITES
          * 
        * *

        Caveats

        *
          - *
        • The system is under active and continuous development. All outputs, the underlying likelihood model, arguments, and - * file formats are likely to change.
        • - *
        • The system can be very aggressive in calling variants. In the 1000 genomes project for pilot 2 (deep coverage of ~35x) - * we expect the raw Qscore > 50 variants to contain at least ~10% FP calls. We use extensive post-calling filters to eliminate - * most of these FPs. Variant Quality Score Recalibration is a tool to perform this filtering.
        • - *
        • The generalized ploidy model can be used to handle non-diploid or pooled samples (see the -ploidy argument in the table below).
        • + *
        • The caller can be very aggressive in calling variants in order to be very sensitive, so the raw output will + * contain many false positives. We use extensive post-calling filters to eliminate most of these FPs. See the documentation on filtering (especially by Variant Quality Score Recalibration) for more details.
        • + *
        • This tool has been deprecated in favor of HaplotypeCaller, a much more sophisticated variant caller that + * produces much better calls, especially on indels, and includes features that allow it to scale to much larger + * cohort sizes.
        • *
        * + *

        Special note on ploidy

        + *

        This tool is able to handle almost any ploidy (except very high ploidies in large pooled experiments); the ploidy + * can be specified using the -ploidy argument for non-diploid organisms.

        + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT) @ReadFilters( {BadMateFilter.class, MappingQualityUnavailableFilter.class} ) @Reference(window=@Window(start=-200,stop=200)) +@PartitionBy(value = PartitionType.LOCUS, includeUnmapped = false) @By(DataSource.REFERENCE) // TODO -- When LocusIteratorByState gets cleaned up, we should enable multiple @By sources: // TODO -- @By( {DataSource.READS, DataSource.REFERENCE_ORDERED_DATA} ) @@ -221,7 +219,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif * Keep in mind that RODRequiringAnnotations are not intended to be used as a group, because they require specific ROD inputs. */ @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls. The single value 'none' removes the default group", required=false) - protected String[] annotationClassesToUse = { "Standard" }; + protected String[] annotationClassesToUse = { "Standard", "StandardUG" }; // the calculation arguments private UnifiedGenotypingEngine genotypingEngine = null; @@ -267,7 +265,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif sampleNameSet = Collections.singleton(GenotypeLikelihoodsCalculationModel.DUMMY_SAMPLE_NAME); } else { // get all of the unique sample names - sampleNameSet = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()); + sampleNameSet = ReadUtils.getSAMFileSamples(toolkit.getSAMFileHeader()); if ( UAC.referenceSampleName != null ) sampleNameSet.remove(UAC.referenceSampleName); } @@ -334,20 +332,19 @@ public class UnifiedGenotyper extends LocusWalker, Unif // add the pool values for each genotype if (UAC.genotypeArgs.samplePloidy != GATKVariantContextUtils.DEFAULT_PLOIDY) { - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the alternate allele count, in the same order as listed, for each individual sample")); - headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the alternate allele fraction, in the same order as listed, for each individual sample")); + headerInfo.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.MLE_PER_SAMPLE_ALLELE_COUNT_KEY)); + headerInfo.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.MLE_PER_SAMPLE_ALLELE_FRACTION_KEY)); } if (UAC.referenceSampleName != null) { - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.REFSAMPLE_DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Total reference sample depth")); + headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.REFSAMPLE_DEPTH_KEY)); } if (UAC.annotateAllSitesWithPLs) { - headerInfo.add(new VCFFormatHeaderLine(UnifiedGenotypingEngine.PL_FOR_ALL_SNP_ALLELES_KEY, 10, VCFHeaderLineType.Integer, "Phred-scaled genotype likelihoods for all 4 possible bases regardless of whether there is statistical evidence for them. Ordering is always PL for AA AC CC GA GC GG TA TC TG TT.")); + headerInfo.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.PL_FOR_ALL_SNP_ALLELES_KEY)); } - VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true, - VCFConstants.DOWNSAMPLED_KEY, - VCFConstants.MLE_ALLELE_COUNT_KEY, - VCFConstants.MLE_ALLELE_FREQUENCY_KEY); + headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.DOWNSAMPLED_KEY)); + headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY)); + headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY)); // also, check to see whether comp rods were included if ( dbsnp != null && dbsnp.dbsnp.isBound() ) @@ -362,7 +359,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif // FILTER fields are added unconditionally as it's not always 100% certain the circumstances // where the filters are used. For example, in emitting all sites the lowQual field is used - headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotypingEngine.LOW_QUAL_FILTER_NAME, "Low quality")); + headerInfo.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.LOW_QUAL_FILTER_NAME)); return headerInfo; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java index 6b6b66062..b51f96735 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java @@ -56,10 +56,11 @@ import htsjdk.variant.variantcontext.GenotypesContext; import htsjdk.variant.variantcontext.VariantContext; import org.apache.log4j.Logger; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculationResult; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider; import org.broadinstitute.gatk.utils.BaseUtils; @@ -82,8 +83,6 @@ import java.util.*; */ public class UnifiedGenotypingEngine extends GenotypingEngine { - public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL"; - private static final int SNP_MODEL = 0; private static final int INDEL_MODEL = 1; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorPerformanceTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorPerformanceTest.java index 69175a29c..e97b9ca79 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorPerformanceTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorPerformanceTest.java @@ -54,8 +54,8 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc; import org.apache.log4j.ConsoleAppender; import org.apache.log4j.Logger; import org.apache.log4j.TTCCLayout; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportTable; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportTable; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.SimpleTimer; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/ExactAFCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/ExactAFCalculator.java index fe5f0c2e0..3437c6da3 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/ExactAFCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/ExactAFCalculator.java @@ -52,6 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc; import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.variantcontext.*; @@ -80,9 +81,9 @@ abstract class ExactAFCalculator extends AFCalculator { protected static final Comparator LIKELIHOOD_NON_REF_THEN_SUM_COMPARATOR = new Comparator() { @Override public int compare(final LikelihoodSum o1, final LikelihoodSum o2) { - if (o1.allele == GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) + if (o1.allele == GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) return 1; - else if (o2.allele == GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) + else if (o2.allele == GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) return -1; else return o1.compareTo(o2); @@ -182,7 +183,7 @@ abstract class ExactAFCalculator extends AFCalculator { final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); final int nonRefAltAlleleIndex = GATKVariantContextUtils.indexOfAltAllele(vc, - GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE, false); + GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE, false); final boolean nonRefAltAllelePresent = nonRefAltAlleleIndex >= 0; // should not be considered in the downsizing, so we need to count it out when diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProvider.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProvider.java index 132a1b7cb..53bfbcbe6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProvider.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProvider.java @@ -56,7 +56,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection; -import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection; +import org.broadinstitute.gatk.tools.walkers.genotyper.StandardCallerArgumentCollection; /** * A single fixed instance AF calculator provider. diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalculator.java index 5dda21ebc..4ca9d7a56 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalculator.java @@ -56,7 +56,7 @@ import com.google.java.contract.Requires; import htsjdk.variant.variantcontext.*; import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculators; import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import java.util.*; @@ -195,7 +195,7 @@ import java.util.*; else { final VariantContextBuilder vcb = new VariantContextBuilder(vc); final Allele reference = vcb.getAlleles().get(0); - vcb.alleles(Arrays.asList(reference, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE)); + vcb.alleles(Arrays.asList(reference, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE)); final int genotypeCount = GenotypeLikelihoodCalculators.genotypeCount(2, vc.getNAlleles()); final double[] hetLikelihoods = new double[vc.getNAlleles() - 1]; final double[] homAltLikelihoods = new double[genotypeCount - hetLikelihoods.length - 1]; @@ -213,7 +213,7 @@ import java.util.*; else if (oldAllele.isNoCall()) newAlleles.add(Allele.NO_CALL); else - newAlleles.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + newAlleles.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); } gb.alleles(newAlleles); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTrimmer.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTrimmer.java index 3fa393eb0..6a8564b28 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTrimmer.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTrimmer.java @@ -72,7 +72,7 @@ import java.util.*; * * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ -class ActiveRegionTrimmer { +public class ActiveRegionTrimmer { /** * Genome location parser use in order to create and manipulate genomic intervals. @@ -115,11 +115,11 @@ class ActiveRegionTrimmer { */ @Hidden @Argument(fullName="paddingAroundIndels", shortName="paddingAroundIndels", doc = "Include at least this many bases around an event for calling indels", required=false) - protected int indelPadding = 150; + public int indelPadding = 150; @Hidden @Argument(fullName="paddingAroundSNPs", shortName="paddingAroundSNPs", doc = "Include at least this many bases around an event for calling snps", required=false) - protected int snpPadding = 20; + public int snpPadding = 20; /** * Holds a reference the trimmer logger. @@ -143,7 +143,7 @@ class ActiveRegionTrimmer { * @throws IllegalArgumentException if the input location parser is {@code null}. * @throws UserException.BadArgumentValue if any of the user argument values is invalid. */ - void initialize(final GenomeLocParser glp, final boolean debug, final boolean isGGA, final boolean emitReferenceConfidence) { + public void initialize(final GenomeLocParser glp, final boolean debug, final boolean isGGA, final boolean emitReferenceConfidence) { if (locParser != null) throw new IllegalStateException(getClass().getSimpleName() + " instance initialized twice"); if (glp == null) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AssemblyBasedCallerArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AssemblyBasedCallerArgumentCollection.java new file mode 100644 index 000000000..06f390e71 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AssemblyBasedCallerArgumentCollection.java @@ -0,0 +1,139 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import org.broadinstitute.gatk.tools.walkers.genotyper.StandardCallerArgumentCollection; +import org.broadinstitute.gatk.utils.commandline.Advanced; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.Output; +import org.broadinstitute.gatk.utils.haplotypeBAMWriter.HaplotypeBAMWriter; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; + +/** + * Set of arguments for Assembly Based Callers + * + * @author Kristian Cibulskis <kcibul@broadinstitute.org> + */ +public class AssemblyBasedCallerArgumentCollection extends StandardCallerArgumentCollection { + + @Advanced + @Argument(fullName="debug", shortName="debug", doc="Print out very verbose debug information about each triggering active region", required = false) + public boolean DEBUG; + + @Advanced + @Argument(fullName="useFilteredReadsForAnnotations", shortName="useFilteredReadsForAnnotations", doc = "Use the contamination-filtered read maps for the purposes of annotating variants", required=false) + public boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = false; + + /** + * The reference confidence mode makes it possible to emit a per-bp or summarized confidence estimate for a site being strictly homozygous-reference. + * See http://www.broadinstitute.org/gatk/guide/article?id=2940 for more details of how this works. + * Note that if you set -ERC GVCF, you also need to set -variant_index_type LINEAR and -variant_index_parameter 128000 (with those exact values!). + * This requirement is a temporary workaround for an issue with index compression. + */ + @Advanced + @Argument(fullName="emitRefConfidence", shortName="ERC", doc="Mode for emitting reference confidence scores", required = false) + protected ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE; + + @Override + public AssemblyBasedCallerArgumentCollection clone() { + return (AssemblyBasedCallerArgumentCollection) super.clone(); + } + + /** + * The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. + * Note that the output here does not include uninformative reads so that not every input read is emitted to the bam. + * + * Turning on this mode may result in serious performance cost for the HC. It's really only appropriate to + * use in specific areas where you want to better understand why the HC is making specific calls. + * + * The reads are written out containing an "HC" tag (integer) that encodes which haplotype each read best matches + * according to the haplotype caller's likelihood calculation. The use of this tag is primarily intended + * to allow good coloring of reads in IGV. Simply go to "Color Alignments By > Tag" and enter "HC" to more + * easily see which reads go with these haplotype. + * + * Note that the haplotypes (called or all, depending on mode) are emitted as single reads covering the entire + * active region, coming from read HC and a special read group. + * + * Note also that only reads that are actually informative about the haplotypes are emitted. By informative we mean + * that there's a meaningful difference in the likelihood of the read coming from one haplotype compared to + * its next best haplotype. + * + * The best way to visualize the output of this mode is with IGV. Tell IGV to color the alignments by tag, + * and give it the HC tag, so you can see which reads support each haplotype. Finally, you can tell IGV + * to group by sample, which will separate the potential haplotypes from the reads. All of this can be seen in + * this screenshot + * + */ + @Advanced + @Output(fullName="bamOutput", shortName="bamout", doc="File to which assembled haplotypes should be written", required = false, defaultToStdout = false) + public GATKSAMFileWriter bamWriter = null; + + /** + * The type of BAM output we want to see. This determines whether HC will write out all of the haplotypes it + * considered (top 128 max) or just the ones that were selected as alleles and assigned to samples. + */ + @Advanced + @Argument(fullName="bamWriterType", shortName="bamWriterType", doc="Which haplotypes should be written to the BAM", required = false) + public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES; + + /** + * If set, certain "early exit" optimizations in HaplotypeCaller, which aim to save compute and time by skipping + * calculations if an ActiveRegion is determined to contain no variants, will be disabled. This is most likely to be useful if + * you're using the -bamout argument to examine the placement of reads following reassembly and are interested in seeing the mapping of + * reads in regions with no variations. Setting the -forceActive and -dontTrimActiveRegions flags may also be necessary. + */ + @Advanced + @Argument(fullName = "disableOptimizations", shortName="disableOptimizations", doc="Don't skip calculations in ActiveRegions with no variants", + required = false) + public boolean disableOptimizations = false; + +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java index 31dab29e7..b70765402 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java @@ -52,7 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.SeqGraph; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java index 3b01d036e..91120f43d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java @@ -53,9 +53,9 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import htsjdk.variant.variantcontext.Allele; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.tools.walkers.genotyper.AlleleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedAlleleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.MultiSampleEdge; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.Path; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.Route; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index f4622fa30..3d3299053 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -57,18 +57,22 @@ import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.downsampling.AlleleBiasedDownsamplingUtils; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.downsampling.DownsamplingUtils; +import org.broadinstitute.gatk.engine.io.DirectOutputTracker; +import org.broadinstitute.gatk.engine.io.stubs.SAMFileWriterStub; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.downsampling.AlleleBiasedDownsamplingUtils; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingUtils; import org.broadinstitute.gatk.engine.filters.BadMateFilter; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; +import org.broadinstitute.gatk.utils.genotyper.*; import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.*; import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; @@ -88,12 +92,9 @@ import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.fragments.FragmentCollection; import org.broadinstitute.gatk.utils.fragments.FragmentUtils; -import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils; import org.broadinstitute.gatk.utils.gvcf.GVCFWriter; import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.haplotype.LDMerger; -import org.broadinstitute.gatk.utils.haplotype.MergeVariantsAcrossHaplotypes; import org.broadinstitute.gatk.utils.haplotypeBAMWriter.HaplotypeBAMWriter; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; @@ -101,16 +102,13 @@ import org.broadinstitute.gatk.utils.pairhmm.PairHMM; import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.ReadUtils; -import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants; +import org.broadinstitute.gatk.utils.variant.*; import java.io.FileNotFoundException; -import java.io.PrintStream; import java.util.*; /** - * Call SNPs and indels simultaneously via local re-assembly of haplotypes in an active region. + * Call SNPs and indels simultaneously via local re-assembly of haplotypes in an active region * *

        The basic operation of the HaplotypeCaller proceeds as follows:

        * @@ -141,7 +139,6 @@ import java.util.*; * read data to calculate the likelihoods of each genotype per sample given the read data observed for that * sample. The most likely genotype is then assigned to the sample.

        * - *
        *

        Input

        *

        * Input bam file(s) from which to make calls @@ -149,23 +146,25 @@ import java.util.*; * *

        Output

        *

        - * VCF file with raw, unfiltered SNP and indel calls. These must be filtered either by variant recalibration (best) or hard-filtering before use in downstream analyses. + * Either a VCF or gVCF file with raw, unfiltered SNP and indel calls. Regular VCFs must be filtered either by variant + * recalibration (best) or hard-filtering before use in downstream analyses. If using the reference-confidence model + * workflow for cohort analysis, the output is a GVCF file that must first be run through GenotypeGVCFs and then + * filtering before further analysis. *

        * - *

        Examples

        + *

        Usage examples

        * *

        These are example commands that show how to run HaplotypeCaller for typical use cases. Square brackets ("[ ]") * indicate optional arguments. Note that parameter values shown here may not be the latest recommended; see the * Best Practices documentation for detailed recommendations.

        * *
        - *

        Single-sample all-sites calling on DNAseq (for GVCF-based cohort analysis workflow)

        - *

        + *

        Single-sample all-sites calling on DNAseq (for `-ERC GVCF` cohort analysis workflow)

        *
          *   java
          *     -jar GenomeAnalysisTK.jar
          *     -T HaplotypeCaller
        - *     -R reference/human_g1k_v37.fasta
        + *     -R reference.fasta
          *     -I sample1.bam \
          *     --emitRefConfidence GVCF \
          *     --variant_index_type LINEAR \
        @@ -174,15 +173,13 @@ import java.util.*;
          *     [-L targets.interval_list] \
          *     -o output.raw.snps.indels.g.vcf
          * 
        - *

        * *

        Variant-only calling on DNAseq

        - *

        *

          *   java
          *     -jar GenomeAnalysisTK.jar
          *     -T HaplotypeCaller
        - *     -R reference/human_g1k_v37.fasta
        + *     -R reference.fasta
          *     -I sample1.bam [-I sample2.bam ...] \
          *     [--dbsnp dbSNP.vcf] \
          *     [-stand_call_conf 30] \
        @@ -190,32 +187,40 @@ import java.util.*;
          *     [-L targets.interval_list] \
          *     -o output.raw.snps.indels.vcf
          * 
        - *

        * *

        Variant-only calling on RNAseq

        - *

        *

          *   java
          *     -jar GenomeAnalysisTK.jar
          *     -T HaplotypeCaller
        - *     -R reference/human_g1k_v37.fasta
        + *     -R reference.fasta
          *     -I sample1.bam \
        - *     -dontUseSoftClippedBases \
          *     [--dbsnp dbSNP.vcf] \
          *     -stand_call_conf 20 \
          *     -stand_emit_conf 20 \
          *     -o output.raw.snps.indels.vcf
          * 
        - *

        * *

        Caveats

        *
          *
        • We have not yet fully tested the interaction between the GVCF-based calling or the multisample calling and the - * RNAseq-specific functionalities.Use those in combination at your own risk.
        • + * RNAseq-specific functionalities. Use those in combination at your own risk. *
        • Many users have reported issues running HaplotypeCaller with the -nct argument, so we recommend using Queue to * parallelize HaplotypeCaller instead of multithreading.
        • *
        * + *

        Special note on ploidy

        + *

        This tool is able to handle almost any ploidy (except very high ploidies in large pooled experiments); the ploidy + * can be specified using the -ploidy argument for non-diploid organisms.

        + * + *

        Additional Notes

        + *
          + *
        • When working with PCR-free data, be sure to set `-pcr_indel_model NONE` (see argument below).
        • + *
        • When running in `-ERC GVCF` or `-ERC BP_RESOLUTION` modes, the emitting and calling confidence thresholds + * are automatically set to 0. This cannot be overridden by the command line. The thresholds can be set manually + * to the desired levels in the next step of the workflow (GenotypeGVCFs)
        • + *
        + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) @@ -223,7 +228,7 @@ import java.util.*; @BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) @ActiveRegionTraversalParameters(extension=100, maxRegion=300) @ReadFilters({HCMappingQualityFilter.class}) -@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=250) +@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=500) public class HaplotypeCaller extends ActiveRegionWalker, Integer> implements AnnotatorCompatible, NanoSchedulable { // ----------------------------------------------------------------------------------------------- // general haplotype caller arguments @@ -238,69 +243,16 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Hidden @Advanced @Argument(fullName="likelihoodCalculationEngine",shortName="likelihoodEngine", - doc="what likelihood calculation engine to use to calculate the relative likelihood of reads vs haplotypes",required=false) + doc="What likelihood calculation engine to use to calculate the relative likelihood of reads vs haplotypes",required=false) protected ReadLikelihoodCalculationEngine.Implementation likelihoodEngineImplementation = ReadLikelihoodCalculationEngine.Implementation.PairHMM; @Hidden @Advanced - @Argument(fullName="heterogeneousKmerSizeResolution",shortName="hksr",doc="how to solve heterogeneous kmer situations using the fast method",required=false) + @Argument(fullName="heterogeneousKmerSizeResolution",shortName="hksr",doc="How to solve heterogeneous kmer situations using the fast method",required=false) protected HeterogeneousKmerSizeResolution heterogeneousKmerSizeResolution = HeterogeneousKmerSizeResolution.COMBO_MIN; - /** - * This argument is meant for debugging and is not immediately useful for normal analysis use. - */ - @Output(fullName="graphOutput", shortName="graph", doc="Write debug assembly graph information to this file", required = false, defaultToStdout = false) - protected PrintStream graphWriter = null; - - /** - * The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. - * Note that the output here does not include uninformative reads so that not every input read is emitted to the bam. - * - * Turning on this mode may result in serious performance cost for the HC. It's really only appropriate to - * use in specific areas where you want to better understand why the HC is making specific calls. - * - * The reads are written out containing a HC tag (integer) that encodes which haplotype each read best matches - * according to the haplotype caller's likelihood calculation. The use of this tag is primarily intended - * to allow good coloring of reads in IGV. Simply go to Color Alignments By > Tag and enter HC to more - * easily see which reads go with these haplotype. - * - * Note that the haplotypes (called or all, depending on mode) are emitted as single reads covering the entire - * active region, coming from read HC and a special read group. - * - * Note that only reads that are actually informative about the haplotypes are emitted. By informative we mean - * that there's a meaningful difference in the likelihood of the read coming from one haplotype compared to - * its next best haplotype. - * - * The best way to visualize the output of this mode is with IGV. Tell IGV to color the alignments by tag, - * and give it the HC tag, so you can see which reads support each haplotype. Finally, you can tell IGV - * to group by sample, which will separate the potential haplotypes from the reads. All of this can be seen - * in the following screenshot: https://www.dropbox.com/s/xvy7sbxpf13x5bp/haplotypecaller%20bamout%20for%20docs.png - * - */ - @Advanced - @Output(fullName="bamOutput", shortName="bamout", doc="File to which assembled haplotypes should be written", required = false, defaultToStdout = false) - protected GATKSAMFileWriter bamWriter = null; private HaplotypeBAMWriter haplotypeBAMWriter; - /** - * The type of BAM output we want to see. This determines whether HC will write out all of the haplotypes it - * considered (top 128 max) or just the ones that were selected as alleles and assigned to samples. - */ - @Advanced - @Argument(fullName="bamWriterType", shortName="bamWriterType", doc="Which haplotypes should be written to the BAM?", required = false) - public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES; - - /** - * If set, certain "early exit" optimizations in HaplotypeCaller, which aim to save compute and time by skipping - * calculations if an ActiveRegion is determined to contain no variants, will be disabled. This is most likely to be useful if - * you're using the -bamout argument to examine the placement of reads following reassembly and are interested in seeing the mapping of - * reads in regions with no variations. Setting the -forceActive and -dontTrimActiveRegions flags may also be necessary. - */ - @Advanced - @Argument(fullName = "disableOptimizations", shortName="disableOptimizations", doc="Don't skip calculations in ActiveRegions with no variants", - required = false) - private boolean disableOptimizations = false; - /** * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. * dbSNP is not used in any way for the calculations themselves. @@ -323,7 +275,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In * filtered in the comp track will be ignored. Note that 'dbSNP' has been special-cased (see the --dbsnp argument). */ @Advanced - @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) + @Input(fullName="comp", shortName = "comp", doc="Comparison VCF file", required=false) public List> comps = Collections.emptyList(); public List> getCompRodBindings() { return comps; } @@ -333,19 +285,24 @@ public class HaplotypeCaller extends ActiveRegionWalker, In public boolean alwaysAppendDbsnpId() { return false; } /** - * Which annotations to add to the output VCF file. The single value 'none' removes the default annotations. See the VariantAnnotator -list argument to view available annotations. + * Which annotations to add to the output VCF file. The single value 'none' removes the default annotations. + * See the VariantAnnotator -list argument to view available annotations. */ @Advanced @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"ClippingRankSumTest", "DepthPerSampleHC"})); /** - * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments, - * so these annotations will be excluded even if they are explicitly included with the other options. + * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the + * -A or -G arguments, so these annotations will be excluded even if they are explicitly included with the other + * options. When HaplotypeCaller is run with -ERC GVCF or -ERC BP_RESOLUTION, some annotations are excluded from the + * output by default because they will only be meaningful once they have been recalculated by GenotypeGVCFs. As + * of version 3.3 this concerns ChromosomeCounts, FisherStrand, StrandOddsRatio and QualByDepth. + * */ @Advanced @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) - protected List annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); + protected List annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{})); /** * Which groups of annotations to add to the output VCF file. The single value 'none' removes the default group. See @@ -357,7 +314,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In protected String[] annotationClassesToUse = { "Standard" }; @ArgumentCollection - private HaplotypeCallerArgumentCollection SCAC = new HaplotypeCallerArgumentCollection(); + private HaplotypeCallerArgumentCollection HCAC = new HaplotypeCallerArgumentCollection(); + + @ArgumentCollection + private LikelihoodEngineArgumentCollection LEAC = new LikelihoodEngineArgumentCollection(); /** * You can use this argument to specify that HC should process a single sample out of a multisample BAM file. This @@ -367,70 +327,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="sample_name", shortName = "sn", doc="Name of single sample to use from a multi-sample bam", required=false) protected String sampleNameToUse = null; - // ----------------------------------------------------------------------------------------------- - // arguments to control internal behavior of the read threading assembler - // ----------------------------------------------------------------------------------------------- - - /** - * Multiple kmer sizes can be specified, using e.g. `-kmerSize 10 -kmerSize 25`. - */ - @Advanced - @Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false) - protected List kmerSizes = Arrays.asList(10, 25); - - /** - * When graph cycles are detected, the normal behavior is to increase kmer sizes iteratively until the cycles are - * resolved. Disabling this behavior may cause the program to give up on assembling the ActiveRegion. - */ - @Advanced - @Argument(fullName="dontIncreaseKmerSizesForCycles", shortName="dontIncreaseKmerSizesForCycles", doc="Disable iterating over kmer sizes when graph cycles are detected", required = false) - protected boolean dontIncreaseKmerSizesForCycles = false; - - /** - * By default, the program does not allow processing of reference sections that contain non-unique kmers. Disabling - * this check may cause problems in the assembly graph. - */ - @Advanced - @Argument(fullName="allowNonUniqueKmersInRef", shortName="allowNonUniqueKmersInRef", doc="Allow graphs that have non-unique kmers in the reference", required = false) - protected boolean allowNonUniqueKmersInRef = false; - - /** - * If fewer samples than the specified number pass the minPruning threshold for a given path, that path will be eliminated from the graph. - */ - @Advanced - @Argument(fullName="numPruningSamples", shortName="numPruningSamples", doc="Number of samples that must pass the minPruning threshold", required = false) - protected int numPruningSamples = 1; - - /** - * As of version 3.3, this argument is no longer needed because dangling end recovery is now the default behavior. See GATK 3.3 release notes for more details. - */ - @Deprecated - @Argument(fullName="recoverDanglingHeads", shortName="recoverDanglingHeads", doc="This argument is deprecated since version 3.3", required = false) - protected boolean DEPRECATED_RecoverDanglingHeads = false; - - /** - * By default, the read threading assembler will attempt to recover dangling heads and tails. See the `minDanglingBranchLength` argument documentation for more details. - */ - @Hidden - @Argument(fullName="doNotRecoverDanglingBranches", shortName="doNotRecoverDanglingBranches", doc="Disable dangling head and tail recovery", required = false) - protected boolean doNotRecoverDanglingBranches = false; - - /** - * When constructing the assembly graph we are often left with "dangling" branches. The assembly engine attempts to rescue these branches - * by merging them back into the main graph. This argument describes the minimum length of a dangling branch needed for the engine to - * try to rescue it. A smaller number here will lead to higher sensitivity to real variation but also to a higher number of false positives. - */ - @Advanced - @Argument(fullName="minDanglingBranchLength", shortName="minDanglingBranchLength", doc="Minimum length of a dangling branch to attempt recovery", required = false) - protected int minDanglingBranchLength = 4; - - /** - * This argument is specifically intended for 1000G consensus analysis mode. Setting this flag will inject all - * provided alleles to the assembly graph but will not forcibly genotype all of them. - */ - @Advanced - @Argument(fullName="consensus", shortName="consensus", doc="1000G consensus mode", required = false) - protected boolean consensusMode = false; + @ArgumentCollection + private ReadThreadingAssemblerArgumentCollection RTAC = new ReadThreadingAssemblerArgumentCollection(); // ----------------------------------------------------------------------------------------------- // general advanced arguments to control haplotype caller behavior @@ -474,29 +372,12 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) public byte MIN_BASE_QUALTY_SCORE = 10; - /** - * Paths with fewer supporting kmers than the specified threshold will be pruned from the graph. - * - * Be aware that this argument can dramatically affect the results of variant calling and should only be used with great caution. - * Using a prune factor of 1 (or below) will prevent any pruning from the graph, which is generally not ideal; it can make the - * calling much slower and even less accurate (because it can prevent effective merging of "tails" in the graph). Higher values - * tend to make the calling much faster, but also lowers the sensitivity of the results (because it ultimately requires higher - * depth to produce calls). - */ - @Advanced - @Argument(fullName="minPruning", shortName="minPruning", doc = "Minimum support to not prune paths in the graph", required = false) - protected int MIN_PRUNE_FACTOR = 2; - - @Advanced - @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) - protected int gcpHMM = 10; /** - * If this flag is provided, the haplotype caller will include unmapped reads (that have chromosomal coordinates) in the assembly and calling - * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the - * read can map, but if its pair is too divergent then it may be unmapped and placed next to its mate, taking - * the mates contig and alignment start. If this flag is provided the haplotype caller will see such reads, - * and may make use of them in assembly and calling, where possible. + * If this flag is provided, the HaplotypeCaller will include unmapped reads (that have chromosomal coordinates) in the assembly and calling + * when these reads occur in the region being analyzed. This situation can occur in paired end analyses, when one read in the read pair + * gets mapped but its mate is too divergent. In that case, the mate will be marked as unmapped and placed next to the first read, assigned to the same + * contig and alignment start. If this flag is provided, the HaplotypeCaller will see such reads, and may make use of them in assembly and calling, where possible. */ @Hidden @Argument(fullName="includeUmappedReads", shortName="unmapped", doc="Include unmapped reads with chromosomal coordinates", required = false) @@ -507,59 +388,18 @@ public class HaplotypeCaller extends ActiveRegionWalker, In protected boolean USE_ALLELES_TRIGGER = false; /** - * The phredScaledGlobalReadMismappingRate reflects the average global mismapping rate of all reads, regardless of their - * mapping quality. This term effects the probability that a read originated from the reference haplotype, regardless of - * its edit distance from the reference, in that the read could have originated from the reference haplotype but - * from another location in the genome. Suppose a read has many mismatches from the reference, say like 5, but - * has a very high mapping quality of 60. Without this parameter, the read would contribute 5 * Q30 evidence - * in favor of its 5 mismatch haplotype compared to reference, potentially enough to make a call off that single - * read for all of these events. With this parameter set to Q30, though, the maximum evidence against any haplotype - * that this (and any) read could contribute is Q30. - * - * Set this term to any negative number to turn off the global mapping rate. + * As of GATK 3.3, HaplotypeCaller outputs physical (read-based) information (see version 3.3 release notes and documentation for details). This argument disables that behavior. */ @Advanced - @Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false) - protected int phredScaledGlobalReadMismappingRate = 45; - - /** - * The assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype - * considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the - * run of the haplotype caller we only take maxNumHaplotypesInPopulation paths from the graph, in order of their - * weights, no matter how many paths are possible to generate from the graph. Putting this number too low - * will result in dropping true variation because paths that include the real variant are not even considered. - * You can consider increasing this number when calling organisms with high heterozygosity. - */ - @Advanced - @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population", required = false) - protected int maxNumHaplotypesInPopulation = 128; - - @Advanced - @Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="Merge variants together into block substitutions if they are in strong local LD", required = false) - protected boolean mergeVariantsViaLD = false; - - /** - * As of GATK 3.3, HaplotypeCaller outputs physical information (see release notes and documentation for details). This argument disables that behavior. - */ - @Advanced - @Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Don't try to add physical (read-based) phasing information", required = false) + @Argument(fullName="doNotRunPhysicalPhasing", shortName="doNotRunPhysicalPhasing", doc="Disable physical phasing", required = false) protected boolean doNotRunPhysicalPhasing = false; - public static final String HAPLOTYPE_CALLER_PHASING_ID_KEY = "PID"; - public static final String HAPLOTYPE_CALLER_PHASING_GT_KEY = "PGT"; - // ----------------------------------------------------------------------------------------------- // arguments for debugging / developing the haplotype caller // ----------------------------------------------------------------------------------------------- - /** - * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. - */ - @Hidden - @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) - public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.VECTOR_LOGLESS_CACHING; @Hidden - @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) + @Argument(fullName="keepRG", shortName="keepRG", doc="Only use reads from this read group when making calls (but use all reads to build the assembly)", required = false) protected String keepRG = null; /** @@ -576,17 +416,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "Perform assembly but do not genotype variants", required=false) protected boolean dontGenotype = false; - /** - * Enabling this argument may cause fundamental problems with the assembly graph itself. - */ - @Hidden - @Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly", required=false) - protected boolean errorCorrectKmers = false; - - @Hidden - @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="Write DOT formatted graph files out of the assembler for only this graph size", required = false) - protected boolean debugGraphTransformations = false; - @Advanced @Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="Do not analyze soft clipped bases in the reads", required = false) protected boolean dontUseSoftClippedBases = false; @@ -595,14 +424,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="captureAssemblyFailureBAM", shortName="captureAssemblyFailureBAM", doc="Write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", required = false) protected boolean captureAssemblyFailureBAM = false; - @Hidden - @Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="Allow cycles in the kmer graphs to generate paths with multiple copies of the path sequenece rather than just the shortest paths", required = false) - protected boolean allowCyclesInKmerGraphToGeneratePaths = false; - - @Hidden - @Argument(fullName="noFpga", shortName="noFpga", doc="Disable the use of the FPGA HMM implementation", required = false) - protected boolean noFpga = false; - // Parameters to control read error correction /** * Enabling this argument may cause fundamental problems with the assembly graph itself. @@ -611,17 +432,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="errorCorrectReads", shortName="errorCorrectReads", doc = "Use an exploratory algorithm to error correct the kmers used during assembly", required=false) protected boolean errorCorrectReads = false; - /** - * Enabling this argument may cause fundamental problems with the assembly graph itself. - */ - @Hidden - @Argument(fullName="kmerLengthForReadErrorCorrection", shortName="kmerLengthForReadErrorCorrection", doc = "Use an exploratory algorithm to error correct the kmers used during assembly", required=false) - protected int kmerLengthForReadErrorCorrection = 25; - - @Hidden - @Argument(fullName="minObservationsForKmerToBeSolid", shortName="minObservationsForKmerToBeSolid", doc = "A k-mer must be seen at least these times for it considered to be solid", required=false) - protected int minObservationsForKmerToBeSolid = 20; - /** * When calculating the likelihood of variants, we can try to correct for PCR errors that cause indel artifacts. * The correction is based on the reference context, and acts specifically around repetitive sequences that tend @@ -662,10 +472,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In * not reducing coverage at any read start to less than minReadsPerAlignmentStart */ @Argument(fullName = "maxReadsInRegionPerSample", shortName = "maxReadsInRegionPerSample", doc="Maximum reads in an active region", required = false) - protected int maxReadsInRegionPerSample = 1000; + protected int maxReadsInRegionPerSample = 10000; @Argument(fullName = "minReadsPerAlignmentStart", shortName = "minReadsPerAlignStart", doc="Minimum number of reads sharing the same alignment start for each genomic location in an active region", required = false) - protected int minReadsPerAlignmentStart = 5; + protected int minReadsPerAlignmentStart = 10; private byte MIN_TAIL_QUALITY; private static final byte MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION = 6; @@ -680,9 +490,21 @@ public class HaplotypeCaller extends ActiveRegionWalker, In ReferenceConfidenceModel referenceConfidenceModel = null; - // as determined experimentally Nov-Dec 2013 - public final static GATKVCFIndexType OPTIMAL_GVCF_INDEX_TYPE = GATKVCFIndexType.LINEAR; - public final static int OPTIMAL_GVCF_INDEX_PARAMETER = 128000; + + //////////////////////////////////////////////////////////////////////// + //// Deprecated Arguments //// + //// Keeping them here is meant to provide informative error messages // + //// when an argument has been put out of service //// + //////////////////////////////////////////////////////////////////////// + /** + * @deprecated + * Deprecated: 2015-04-01, J.White + * mergeVariantsViaLD = false made final + */ + @Hidden + @Deprecated + @Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="DEPRECATED; This argument is no longer used in GATK versions 3.4 and newer. Please see the online documentation for the latest usage recommendations.", required = false) + static final boolean mergeVariantsViaLD = false; //--------------------------------------------------------------------------------------------------------------- // @@ -693,21 +515,21 @@ public class HaplotypeCaller extends ActiveRegionWalker, In public void initialize() { super.initialize(); - if (SCAC.genotypeArgs.samplePloidy != HomoSapiensConstants.DEFAULT_PLOIDY && !doNotRunPhysicalPhasing) { + if (HCAC.genotypeArgs.samplePloidy != HomoSapiensConstants.DEFAULT_PLOIDY && !doNotRunPhysicalPhasing) { doNotRunPhysicalPhasing = true; logger.info("Currently, physical phasing is not available when ploidy is different than " + HomoSapiensConstants.DEFAULT_PLOIDY + "; therefore it won't be performed"); } if (dontGenotype && emitReferenceConfidence()) - throw new UserException("You cannot request gVCF output and do not genotype at the same time"); + throw new UserException("You cannot request gVCF output and 'do not genotype' at the same time"); if ( emitReferenceConfidence() ) { - if (SCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES) + if (HCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES) throw new UserException.BadArgumentValue("ERC/gt_mode","you cannot request reference confidence output and GENOTYPE_GIVEN_ALLELES at the same time"); - SCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = -0.0; - SCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = -0.0; + HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = -0.0; + HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = -0.0; // also, we don't need to output several of the annotations annotationsToExclude.add("ChromosomeCounts"); @@ -718,9 +540,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // but we definitely want certain other ones annotationsToUse.add("StrandBiasBySample"); logger.info("Standard Emitting and Calling confidence set to 0.0 for reference-model confidence output"); - if (!SCAC.annotateAllSitesWithPLs) + if (!HCAC.annotateAllSitesWithPLs) logger.info("All sites annotated with PLs forced to true for reference-model confidence output"); - SCAC.annotateAllSitesWithPLs = true; + HCAC.annotateAllSitesWithPLs = true; } else if ( ! doNotRunPhysicalPhasing ) { doNotRunPhysicalPhasing = true; logger.info("Disabling physical phasing, which is supported only for reference-model confidence output"); @@ -744,31 +566,31 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested - final UnifiedArgumentCollection simpleUAC = SCAC.cloneTo(UnifiedArgumentCollection.class); + final UnifiedArgumentCollection simpleUAC = HCAC.cloneTo(UnifiedArgumentCollection.class); simpleUAC.outputMode = OutputMode.EMIT_VARIANTS_ONLY; simpleUAC.genotypingOutputMode = GenotypingOutputMode.DISCOVERY; - simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, SCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, SCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, HCAC.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling simpleUAC.CONTAMINATION_FRACTION = 0.0; simpleUAC.CONTAMINATION_FRACTION_FILE = null; simpleUAC.exactCallsLog = null; // Seems that at least with some test data we can lose genuine haploid variation if we use // UGs engine with ploidy == 1 - simpleUAC.genotypeArgs.samplePloidy = Math.max(2,SCAC.genotypeArgs.samplePloidy); + simpleUAC.genotypeArgs.samplePloidy = Math.max(2, HCAC.genotypeArgs.samplePloidy); activeRegionEvaluationGenotyperEngine = new UnifiedGenotypingEngine(simpleUAC, FixedAFCalculatorProvider.createThreadSafeProvider(getToolkit(),simpleUAC,logger), toolkit); activeRegionEvaluationGenotyperEngine.setLogger(logger); - if( SCAC.CONTAMINATION_FRACTION_FILE != null ) - SCAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(SCAC.CONTAMINATION_FRACTION_FILE, SCAC.CONTAMINATION_FRACTION, sampleSet, logger)); + if( HCAC.CONTAMINATION_FRACTION_FILE != null ) + HCAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(HCAC.CONTAMINATION_FRACTION_FILE, HCAC.CONTAMINATION_FRACTION, sampleSet, logger)); - if( SCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES && consensusMode ) + if( HCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES && RTAC.consensusMode ) throw new UserException("HaplotypeCaller cannot be run in both GENOTYPE_GIVEN_ALLELES mode and in consensus mode at the same time. Please choose one or the other."); final GenomeLocParser genomeLocParser = toolkit.getGenomeLocParser(); - genotypingEngine = new HaplotypeCallerGenotypingEngine( SCAC, samplesList, genomeLocParser, FixedAFCalculatorProvider.createThreadSafeProvider(getToolkit(),SCAC,logger), !doNotRunPhysicalPhasing); + genotypingEngine = new HaplotypeCallerGenotypingEngine(HCAC, samplesList, genomeLocParser, FixedAFCalculatorProvider.createThreadSafeProvider(getToolkit(), HCAC,logger), !doNotRunPhysicalPhasing); // initialize the output VCF header final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); @@ -778,10 +600,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // all annotation fields from VariantAnnotatorEngine headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions()); // all callers need to add these standard annotation header lines - VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true, - VCFConstants.DOWNSAMPLED_KEY, - VCFConstants.MLE_ALLELE_COUNT_KEY, - VCFConstants.MLE_ALLELE_FREQUENCY_KEY); + headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.DOWNSAMPLED_KEY)); + headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY)); + headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY)); // all callers need to add these standard FORMAT field header lines VCFStandardHeaderLines.addStandardFormatLines(headerInfo, true, VCFConstants.GENOTYPE_KEY, @@ -790,18 +611,21 @@ public class HaplotypeCaller extends ActiveRegionWalker, In VCFConstants.GENOTYPE_PL_KEY); if ( ! doNotRunPhysicalPhasing ) { - headerInfo.add(new VCFFormatHeaderLine(HAPLOTYPE_CALLER_PHASING_ID_KEY, 1, VCFHeaderLineType.String, "Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group")); - headerInfo.add(new VCFFormatHeaderLine(HAPLOTYPE_CALLER_PHASING_GT_KEY, 1, VCFHeaderLineType.String, "Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another")); + headerInfo.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_ID_KEY)); + headerInfo.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_GT_KEY)); } // FILTER fields are added unconditionally as it's not always 100% certain the circumstances // where the filters are used. For example, in emitting all sites the lowQual field is used - headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotypingEngine.LOW_QUAL_FILTER_NAME, "Low quality")); + headerInfo.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.LOW_QUAL_FILTER_NAME)); initializeReferenceConfidenceModel(samplesList, headerInfo); vcfWriter.writeHeader(new VCFHeader(headerInfo, sampleSet)); + //now that we have all the VCF headers, initialize the annotations (this is particularly important to turn off RankSumTest dithering in integration tests) + annotationEngine.invokeAnnotationInitializationMethods(headerInfo); + try { // fasta reference reader to supplement the edges of the reference sequence referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); @@ -810,30 +634,30 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } // create and setup the assembler - assemblyEngine = new ReadThreadingAssembler(maxNumHaplotypesInPopulation, kmerSizes, dontIncreaseKmerSizesForCycles, allowNonUniqueKmersInRef, numPruningSamples); + assemblyEngine = new ReadThreadingAssembler(RTAC.maxNumHaplotypesInPopulation, RTAC.kmerSizes, RTAC.dontIncreaseKmerSizesForCycles, RTAC.allowNonUniqueKmersInRef, RTAC.numPruningSamples); - assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); - assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR); - assemblyEngine.setDebug(SCAC.DEBUG); - assemblyEngine.setDebugGraphTransformations(debugGraphTransformations); - assemblyEngine.setAllowCyclesInKmerGraphToGeneratePaths(allowCyclesInKmerGraphToGeneratePaths); - assemblyEngine.setRecoverDanglingBranches(!doNotRecoverDanglingBranches); - assemblyEngine.setMinDanglingBranchLength(minDanglingBranchLength); + assemblyEngine.setErrorCorrectKmers(RTAC.errorCorrectKmers); + assemblyEngine.setPruneFactor(RTAC.MIN_PRUNE_FACTOR); + assemblyEngine.setDebug(HCAC.DEBUG); + assemblyEngine.setDebugGraphTransformations(RTAC.debugGraphTransformations); + assemblyEngine.setAllowCyclesInKmerGraphToGeneratePaths(RTAC.allowCyclesInKmerGraphToGeneratePaths); + assemblyEngine.setRecoverDanglingBranches(!RTAC.doNotRecoverDanglingBranches); + assemblyEngine.setMinDanglingBranchLength(RTAC.minDanglingBranchLength); assemblyEngine.setMinBaseQualityToUseInAssembly(MIN_BASE_QUALTY_SCORE); MIN_TAIL_QUALITY = (byte)(MIN_BASE_QUALTY_SCORE - 1); - if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter); + if ( RTAC.graphWriter != null ) assemblyEngine.setGraphWriter(RTAC.graphWriter); // setup the likelihood calculation engine - if ( phredScaledGlobalReadMismappingRate < 0 ) phredScaledGlobalReadMismappingRate = -1; + if ( LEAC.phredScaledGlobalReadMismappingRate < 0 ) LEAC.phredScaledGlobalReadMismappingRate = -1; // configure the global mismapping rate - if ( phredScaledGlobalReadMismappingRate < 0 ) { + if ( LEAC.phredScaledGlobalReadMismappingRate < 0 ) { log10GlobalReadMismappingRate = - Double.MAX_VALUE; } else { - log10GlobalReadMismappingRate = QualityUtils.qualToErrorProbLog10(phredScaledGlobalReadMismappingRate); - logger.info("Using global mismapping rate of " + phredScaledGlobalReadMismappingRate + " => " + log10GlobalReadMismappingRate + " in log10 likelihood units"); + log10GlobalReadMismappingRate = QualityUtils.qualToErrorProbLog10(LEAC.phredScaledGlobalReadMismappingRate); + logger.info("Using global mismapping rate of " + LEAC.phredScaledGlobalReadMismappingRate + " => " + log10GlobalReadMismappingRate + " in log10 likelihood units"); } //static member function - set number of threads @@ -841,21 +665,21 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // create our likelihood calculation engine likelihoodCalculationEngine = createLikelihoodCalculationEngine(); - final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(SCAC.DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes(); + final MergeVariantsAcrossHaplotypes variantMerger = new MergeVariantsAcrossHaplotypes(); genotypingEngine.setCrossHaplotypeEventMerger(variantMerger); genotypingEngine.setAnnotationEngine(annotationEngine); - if ( bamWriter != null ) { + if ( HCAC.bamWriter != null ) { // we currently do not support multi-threaded BAM writing, so exception out if ( getToolkit().getTotalNumberOfThreads() > 1 ) throw new UserException.BadArgumentValue("bamout", "Currently cannot emit a BAM file from the HaplotypeCaller in multi-threaded mode."); - haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader()); + haplotypeBAMWriter = HaplotypeBAMWriter.create(HCAC.bamWriterType, HCAC.bamWriter, getToolkit().getSAMFileHeader()); } - trimmer.initialize(getToolkit().getGenomeLocParser(), SCAC.DEBUG, - SCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES,emitReferenceConfidence()); + trimmer.initialize(getToolkit().getGenomeLocParser(), HCAC.DEBUG, + HCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES,emitReferenceConfidence()); } private void initializeReferenceConfidenceModel(final SampleList samples, final Set headerInfo) { @@ -864,15 +688,15 @@ public class HaplotypeCaller extends ActiveRegionWalker, In if ( samples.sampleCount() != 1 ) throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently. Use the sample_name argument to run on a single sample out of a multi-sample BAM file."); headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines()); - if ( SCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) { - // a kluge to enforce the use of this indexing strategy - if (getToolkit().getArguments().variant_index_type != OPTIMAL_GVCF_INDEX_TYPE || - getToolkit().getArguments().variant_index_parameter != OPTIMAL_GVCF_INDEX_PARAMETER) { - throw new UserException.GVCFIndexException(OPTIMAL_GVCF_INDEX_TYPE, OPTIMAL_GVCF_INDEX_PARAMETER); + if ( HCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) { + // A kluge to enforce the use of this indexing strategy - must set the gVCF indexing values if not a using a gVCF output file . + // An output gVCF file automatically sets the indexing values because it has the .g.vcf extension. + if (!GATKVCFUtils.usingGVCFIndexingArguments(getToolkit().getArguments().variant_index_type, getToolkit().getArguments().variant_index_parameter) && !isGVCF()) { + throw new UserException.GVCFIndexException(GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); } try { - vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands,SCAC.genotypeArgs.samplePloidy); + vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands, HCAC.genotypeArgs.samplePloidy); } catch ( IllegalArgumentException e ) { throw new UserException.BadArgumentValue("GQBands", "are malformed: " + e.getMessage()); } @@ -888,9 +712,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In private ReadLikelihoodCalculationEngine createLikelihoodCalculationEngine() { switch (likelihoodEngineImplementation) { case PairHMM: - return new PairHMMLikelihoodCalculationEngine( (byte)gcpHMM, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel ); + return new PairHMMLikelihoodCalculationEngine( (byte) LEAC.gcpHMM, LEAC.pairHMM, LEAC.pairHMMSub, LEAC.alwaysLoadVectorLoglessPairHMMLib, log10GlobalReadMismappingRate, LEAC.noFpga, pcrErrorModel ); case GraphBased: - return new GraphBasedLikelihoodCalculationEngine( (byte)gcpHMM,log10GlobalReadMismappingRate, heterogeneousKmerSizeResolution,SCAC.DEBUG,debugGraphTransformations); + return new GraphBasedLikelihoodCalculationEngine( (byte) LEAC.gcpHMM,log10GlobalReadMismappingRate, heterogeneousKmerSizeResolution, HCAC.DEBUG, RTAC.debugGraphTransformations); case Random: return new RandomLikelihoodCalculationEngine(); default: @@ -926,15 +750,15 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"}) public ActivityProfileState isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) { - if( SCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ) { - final VariantContext vcFromAllelesRod = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(tracker, ref.getLocus(), false, logger, SCAC.alleles); + if( HCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ) { + final VariantContext vcFromAllelesRod = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(tracker, ref.getLocus(), false, logger, HCAC.alleles); if( vcFromAllelesRod != null ) { return new ActivityProfileState(ref.getLocus(), 1.0); } } if( USE_ALLELES_TRIGGER ) { - return new ActivityProfileState( ref.getLocus(), tracker.getValues(SCAC.alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 ); + return new ActivityProfileState( ref.getLocus(), tracker.getValues(HCAC.alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 ); } if( context == null || context.getBasePileup().isEmpty() ) @@ -983,8 +807,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In return referenceModelForNoVariation(originalActiveRegion, true); final List givenAlleles = new ArrayList<>(); - if( SCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ) { - for ( final VariantContext vc : metaDataTracker.getValues(SCAC.alleles) ) { + if( HCAC.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES ) { + for ( final VariantContext vc : metaDataTracker.getValues(HCAC.alleles) ) { if ( vc.isNotFiltered() ) { givenAlleles.add(vc); // do something with these VCs during GGA mode } @@ -1006,7 +830,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final ActiveRegionTrimmer.Result trimmingResult = trimmer.trim(originalActiveRegion,allVariationEvents); - if (!trimmingResult.isVariationPresent() && !disableOptimizations) + if (!trimmingResult.isVariationPresent() && !HCAC.disableOptimizations) return referenceModelForNoVariation(originalActiveRegion,false); final AssemblyResultSet assemblyResult = @@ -1024,7 +848,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // abort early if something is out of the acceptable range // TODO is this ever true at this point??? perhaps GGA. Need to check. - if( ! assemblyResult.isVariationPresent() && ! disableOptimizations) + if( ! assemblyResult.isVariationPresent() && ! HCAC.disableOptimizations) return referenceModelForNoVariation(originalActiveRegion, false); // For sure this is not true if gVCF is on. @@ -1032,7 +856,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // TODO is this ever true at this point??? perhaps GGA. Need to check. - if( regionForGenotyping.size() == 0 && ! disableOptimizations) { + if( regionForGenotyping.size() == 0 && ! HCAC.disableOptimizations) { // no reads remain after filtering so nothing else to do! return referenceModelForNoVariation(originalActiveRegion, false); } @@ -1047,7 +871,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,samplesList,reads); // Realign reads to their best haplotype. - final Map readRealignments = realignReadsToTheirBestHaplotype(readLikelihoods, assemblyResult.getPaddedReferenceLoc()); + final Map readRealignments = realignReadsToTheirBestHaplotype(readLikelihoods, assemblyResult.getReferenceHaplotype(), assemblyResult.getPaddedReferenceLoc()); readLikelihoods.changeReads(readRealignments); // Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there @@ -1065,12 +889,12 @@ public class HaplotypeCaller extends ActiveRegionWalker, In regionForGenotyping.getLocation(), getToolkit().getGenomeLocParser(), metaDataTracker, - (consensusMode ? Collections.emptyList() : givenAlleles), + (RTAC.consensusMode ? Collections.emptyList() : givenAlleles), emitReferenceConfidence()); - if ( bamWriter != null ) { + if ( HCAC.bamWriter != null ) { final Set calledHaplotypeSet = new HashSet<>(calledHaplotypes.getCalledHaplotypes()); - if (disableOptimizations) + if (HCAC.disableOptimizations) calledHaplotypeSet.add(assemblyResult.getReferenceHaplotype()); haplotypeBAMWriter.writeReadsAlignedToHaplotypes( haplotypes, @@ -1080,7 +904,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In readLikelihoods); } - if( SCAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } + if( HCAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } if ( emitReferenceConfidence() ) { @@ -1112,7 +936,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In *

        * @return never {@code null} */ - private Map realignReadsToTheirBestHaplotype(final ReadLikelihoods originalReadLikelihoods, final GenomeLoc paddedReferenceLoc) { + private Map realignReadsToTheirBestHaplotype(final ReadLikelihoods originalReadLikelihoods, final Haplotype refHaplotype, final GenomeLoc paddedReferenceLoc) { final Collection.BestAllele> bestAlleles = originalReadLikelihoods.bestAlleles(); final Map result = new HashMap<>(bestAlleles.size()); @@ -1121,7 +945,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final GATKSAMRecord originalRead = bestAllele.read; final Haplotype bestHaplotype = bestAllele.allele; final boolean isInformative = bestAllele.isInformative(); - final GATKSAMRecord realignedRead = AlignmentUtils.createReadAlignedToRef(originalRead,bestHaplotype,paddedReferenceLoc.getStart(),isInformative); + final GATKSAMRecord realignedRead = AlignmentUtils.createReadAlignedToRef(originalRead, bestHaplotype, refHaplotype, paddedReferenceLoc.getStart(), isInformative); result.put(originalRead,realignedRead); } return result; @@ -1149,6 +973,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In protected AssemblyResultSet assembleReads(final ActiveRegion activeRegion, final List giveAlleles) { // Create the reference haplotype which is the bases from the reference that make up the active region finalizeActiveRegion(activeRegion); // handle overlapping fragments, clip adapter and low qual tails + if( HCAC.DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING); final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); @@ -1157,7 +982,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // Create ReadErrorCorrector object if requested - will be used within assembly engine. ReadErrorCorrector readErrorCorrector = null; if (errorCorrectReads) - readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, SCAC.DEBUG, fullReferenceWithPadding); + readErrorCorrector = new ReadErrorCorrector(RTAC.kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, RTAC.minObservationsForKmerToBeSolid, HCAC.DEBUG, fullReferenceWithPadding); try { final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, giveAlleles,readErrorCorrector ); @@ -1167,7 +992,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } catch ( final Exception e ) { // Capture any exception that might be thrown, and write out the assembly failure BAM if requested if ( captureAssemblyFailureBAM ) { - final SAMFileWriter writer = ReadUtils.createSAMFileWriter("assemblyFailure.bam", getToolkit()); + final SAMFileWriter writer = SAMFileWriterStub.createSAMFileWriter("assemblyFailure.bam", getToolkit()); + new DirectOutputTracker().addOutput((SAMFileWriterStub) writer); for ( final GATKSAMRecord read : activeRegion.getReads() ) { writer.addAlignment(read); } @@ -1249,7 +1075,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Override public void onTraversalDone(Integer result) { - if ( SCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it + if ( HCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it referenceConfidenceModel.close(); //TODO remove the need to call close here for debugging, the likelihood output stream should be managed //TODO (open & close) at the walker, not the engine. @@ -1266,8 +1092,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In private void finalizeActiveRegion( final ActiveRegion activeRegion ) { if (activeRegion.isFinalized()) return; - if( SCAC.DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } - // Loop through the reads hard clipping the adaptor and low quality tails final List readsToUse = new ArrayList<>(activeRegion.getReads().size()); for( final GATKSAMRecord myRead : activeRegion.getReads() ) { @@ -1332,7 +1156,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In return splitReadsBySample(samplesList, reads); } - private static Map> splitReadsBySample( final SampleList samplesList, final Collection reads ) { + public static Map> splitReadsBySample( final SampleList samplesList, final Collection reads ) { final Map> returnMap = new HashMap<>(); final int sampleCount = samplesList.sampleCount(); for (int i = 0; i < sampleCount; i++) @@ -1350,7 +1174,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In * @return true if HC must emit reference confidence. */ private boolean emitReferenceConfidence() { - return SCAC.emitReferenceConfidence != ReferenceConfidenceMode.NONE; + return HCAC.emitReferenceConfidence != ReferenceConfidenceMode.NONE; } /** @@ -1376,4 +1200,13 @@ public class HaplotypeCaller extends ActiveRegionWalker, In activeRegion.removeAll( readsToRemove ); } + + /** + * Is writing to an output GVCF file? + * + * @return true if the VCF output file has a .g.vcf extension + */ + private boolean isGVCF() { + return ((VariantContextWriterStub) vcfWriter).getOutputFile().getName().endsWith("." + GATKVCFUtils.GVCF_EXT); + } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java index c5d0073f2..1a4b4af39 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java @@ -51,38 +51,11 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; -import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection; -import org.broadinstitute.gatk.utils.commandline.Advanced; -import org.broadinstitute.gatk.utils.commandline.Argument; - /** * Set of arguments for the {@link HaplotypeCaller} * * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ -public class HaplotypeCallerArgumentCollection extends StandardCallerArgumentCollection { - - @Advanced - @Argument(fullName="debug", shortName="debug", doc="Print out very verbose debug information about each triggering active region", required = false) - protected boolean DEBUG; - - @Advanced - @Argument(fullName="useFilteredReadsForAnnotations", shortName="useFilteredReadsForAnnotations", doc = "Use the contamination-filtered read maps for the purposes of annotating variants", required=false) - protected boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = false; - - /** - * The reference confidence mode makes it possible to emit a per-bp or summarized confidence estimate for a site being strictly homozygous-reference. - * See http://www.broadinstitute.org/gatk/guide/article?id=2940 for more details of how this works. - * Note that if you set -ERC GVCF, you also need to set -variant_index_type LINEAR and -variant_index_parameter 128000 (with those exact values!). - * This requirement is a temporary workaround for an issue with index compression. - */ - @Advanced - @Argument(fullName="emitRefConfidence", shortName="ERC", doc="Mode for emitting reference confidence scores", required = false) - protected ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE; - - @Override - public HaplotypeCallerArgumentCollection clone() { - return (HaplotypeCallerArgumentCollection) super.clone(); - } +public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgumentCollection { } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index 5481cfdeb..0f6d61ab6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -54,7 +54,11 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import htsjdk.variant.variantcontext.*; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.genotyper.*; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider; import org.broadinstitute.gatk.utils.GenomeLoc; @@ -64,8 +68,8 @@ import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.EventMap; import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.haplotype.MergeVariantsAcrossHaplotypes; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import java.util.*; @@ -73,9 +77,9 @@ import java.util.*; /** * {@link HaplotypeCaller}'s genotyping strategy implementation. */ -public class HaplotypeCallerGenotypingEngine extends GenotypingEngine { +public class HaplotypeCallerGenotypingEngine extends GenotypingEngine { - private static final int ALLELE_EXTENSION = 2; + protected static final int ALLELE_EXTENSION = 2; private static final String phase01 = "0|1"; private static final String phase10 = "1|0"; @@ -94,7 +98,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine calls; private final Set calledHaplotypes; - protected CalledHaplotypes(final List calls, final Set calledHaplotypes) { + public CalledHaplotypes(final List calls, final Set calledHaplotypes) { if ( calls == null ) throw new IllegalArgumentException("calls cannot be null"); if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); if ( Utils.xor(calls.isEmpty(), calledHaplotypes.isEmpty()) ) @@ -260,7 +264,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine phasedGenotypes = new ArrayList<>(); for ( final Genotype g : vc.getGenotypes() ) - phasedGenotypes.add(new GenotypeBuilder(g).attribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_ID_KEY, ID).attribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_GT_KEY, phaseGT).make()); + phasedGenotypes.add(new GenotypeBuilder(g).attribute(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_ID_KEY, ID).attribute(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_GT_KEY, phaseGT).make()); return new VariantContextBuilder(vc).genotypes(phasedGenotypes).make(); } @@ -521,14 +526,14 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine originalList = mergedVC.getAlleles(); final List alleleList = new ArrayList<>(originalList.size() + 1); alleleList.addAll(mergedVC.getAlleles()); - alleleList.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + alleleList.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); vcb.alleles(alleleList); return vcb.make(); } // Builds the read-likelihoods collection to use for annotation considering user arguments and the collection // used for genotyping. - private ReadLikelihoods prepareReadAlleleLikelihoodsForAnnotation( + protected ReadLikelihoods prepareReadAlleleLikelihoodsForAnnotation( final ReadLikelihoods readHaplotypeLikelihoods, final Map> perSampleFilteredReadList, final GenomeLocParser genomeLocParser, @@ -550,7 +555,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine decomposeHaplotypesIntoVariantContexts(final List haplotypes, + protected TreeSet decomposeHaplotypesIntoVariantContexts(final List haplotypes, final ReadLikelihoods readLikelihoods, final byte[] ref, final GenomeLoc refLoc, @@ -625,13 +630,13 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine makePriorityList(final List vcs) { + protected List makePriorityList(final List vcs) { final List priorityList = new LinkedList<>(); for ( final VariantContext vc : vcs ) priorityList.add(vc.getSource()); return priorityList; } - private List getVCsAtThisLocation(final List haplotypes, + protected List getVCsAtThisLocation(final List haplotypes, final int loc, final List activeAllelesToGenotype) { // the overlapping events to merge into a common reference view @@ -684,7 +689,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine readLikelihoods, final VariantContext mergedVC, final List noCallAlleles ) { + protected GenotypesContext calculateGLsForThisEvent( final ReadLikelihoods readLikelihoods, final VariantContext mergedVC, final List noCallAlleles ) { final List vcAlleles = mergedVC.getAlleles(); final AlleleList alleleList = readLikelihoods.alleleCount() == vcAlleles.size() ? readLikelihoods : new IndexedAlleleList<>(vcAlleles); final GenotypingLikelihoods likelihoods = genotypingModel.calculateLikelihoods(alleleList,new GenotypingData<>(ploidyModel,readLikelihoods)); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeLDCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeLDCalculator.java new file mode 100644 index 000000000..cf202bf5f --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeLDCalculator.java @@ -0,0 +1,204 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import com.google.java.contract.Requires; +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.AlleleListUtils; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; + +import java.util.*; + +/** + * Computes the likelihood based probability that haplotypes for first and second variant contexts + * only appear in their fully linked form (x11 and x22) given a set of haplotypes where they might occur + * and read likelihoods per sample + * + * User: depristo + * Date: 3/29/13 + * Time: 9:23 AM + */ +public class HaplotypeLDCalculator { + private final List haplotypes; + private final ReadLikelihoods readLikelihoods; + private List> haplotypeLikelihoodsPerSample = null; + + // linear contigency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1] + private final double[] table = new double[4]; + + /** + * For testing + */ + @SuppressWarnings("unchecked") + protected HaplotypeLDCalculator() { + haplotypes = Collections.emptyList(); + final AlleleList alleleList = AlleleListUtils.emptyList(); + readLikelihoods = new ReadLikelihoods<>(SampleListUtils.emptyList(), + alleleList, Collections.EMPTY_MAP); + } + + public HaplotypeLDCalculator(final List haplotypes, final ReadLikelihoods haplotypeReadMap) { + this.haplotypes = haplotypes; + this.readLikelihoods = haplotypeReadMap; + } + + /** + * Construct the cached list of summed haplotype likelihoods per sample if it + * hasn't already been computed. This data structure is lazy created but only + * needs to be made once when we make 1 merge decision as the data doesn't change + * no matter how many calls to computeProbOfBeingPhased + */ + private void buildHaplotypeLikelihoodsPerSampleIfNecessary() { + if ( haplotypeLikelihoodsPerSample == null ) { + // do the lazy computation + final Set samples = new LinkedHashSet<>(readLikelihoods.samples()); + haplotypeLikelihoodsPerSample = new LinkedList<>(); + for( final String sample : samples ) { + final Map map = new HashMap<>(haplotypes.size()); + for( final Haplotype h : haplotypes ) { + // count up the co-occurrences of the events for the R^2 calculation + final double haplotypeLikelihood = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, readLikelihoods, Collections.singletonList(h), false)[0][0]; + map.put(h, haplotypeLikelihood); + } + haplotypeLikelihoodsPerSample.add(map); + } + } + } + + /** + * Compute the likelihood based probability that that haplotypes for first and second are only x11 and x22 + * + * As opposed to the hypothesis that all four haplotypes (x11, x12, x21, and x22) exist in the population + * + * @param first a non-null VariantContext + * @param second a non-null VariantContext + * @return the probability that only x11 and x22 exist among the samples + */ + protected double computeProbOfBeingPhased(final VariantContext first, final VariantContext second) { + buildHaplotypeLikelihoodsPerSampleIfNecessary(); + + Arrays.fill(table, Double.NEGATIVE_INFINITY); + + for ( final Map entry : haplotypeLikelihoodsPerSample ) { + for ( final Map.Entry haplotypeLikelihood : entry.entrySet() ) { + final Haplotype h = haplotypeLikelihood.getKey(); + // count up the co-occurrences of the events for the R^2 calculation + final VariantContext thisHapVC = h.getEventMap().get(first.getStart()); + final VariantContext nextHapVC = h.getEventMap().get(second.getStart()); // TODO -- add function to take a VC + final int i = thisHapVC == null ? 0 : 1; + final int j = nextHapVC == null ? 0 : 1; + final int index = 2 * i + j; + table[index] = MathUtils.approximateLog10SumLog10(table[index], haplotypeLikelihood.getValue()); + } + } + + return pPhased(table); + } + + /** + * Compute probability that two variants are in phase with each other and that no + * compound hets exist in the population. + * + * Implemented as a likelihood ratio test of the hypothesis: + * + * x11 and x22 are the only haplotypes in the populations + * + * vs. + * + * all four haplotype combinations (x11, x12, x21, and x22) all exist in the population. + * + * Now, since we have to have both variants in the population, we exclude the x11 & x11 state. So the + * p of having just x11 and x22 is P(x11 & x22) + p(x22 & x22). + * + * Alternatively, we might have any configuration that gives us both 1 and 2 alts, which are: + * + * - P(x11 & x12 & x21) -- we have hom-ref and both hets + * - P(x22 & x12 & x21) -- we have hom-alt and both hets + * - P(x22 & x12) -- one haplotype is 22 and the other is het 12 + * - P(x22 & x21) -- one haplotype is 22 and the other is het 21 + * + * The probability is just p11_22 / (p11_22 + p hets) + * + * @param table linear contigency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1] + * doesn't have to be normalized as this function does the normalization internally + * @return the real space probability that the data is phased + */ + @Requires("table.length == 4") + protected double pPhased( double[] table ) { + final double[] normTable = MathUtils.normalizeFromLog10(table, true); + + final double x11 = normTable[0], x12 = normTable[1], x21 = normTable[2], x22 = normTable[3]; + + // probability that we are only x11 && x22 + final double p11_22 = MathUtils.approximateLog10SumLog10(x11 + x22, x22 + x22); + + // probability of having any of the other pairs + final double p11_12_21 = MathUtils.approximateLog10SumLog10(x11 + x12, x11 + x21, x12 + x21); + final double p22_12_21 = MathUtils.approximateLog10SumLog10(x22 + x12, x22 + x21, x12 + x21); + final double p22_12 = x22 + x12; + final double p22_21 = x22 + x21; + final double pOthers = MathUtils.approximateLog10SumLog10(new double[]{p11_12_21, p22_12_21, p22_12, p22_21}); + + // probability of being phases is the ratio of p11_22 / pOthers which in log space is just a substraction + final double log10phased = p11_22 - (MathUtils.approximateLog10SumLog10(p11_22, pOthers)); + + return Math.pow(10.0, log10phased); + } + + protected double pPhasedTest( final double x11, final double x12, final double x21, final double x22 ) { + return pPhased(new double[]{x11, x12, x21, x22}); + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeResolver.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeResolver.java index db80ac196..8bf383240 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeResolver.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeResolver.java @@ -56,9 +56,9 @@ import org.broadinstitute.gatk.utils.commandline.Input; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.Reference; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.Window; @@ -82,16 +82,17 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; import java.util.*; /** - * Haplotype-based resolution of variants in 2 different eval files. + * Haplotype-based resolution of variants in separate callsets. * *

        - * HaplotypeResolver is a tool that takes 2 VCF files and constructs haplotypes based on the variants inside them. + * HaplotypeResolver is a tool that takes two VCF files and constructs haplotypes based on the variants inside them. * From that, it can resolve potential differences in variant calls that are inherently the same (or similar) variants. * Records are annotated with the set and status attributes. + *

        * *

        Input

        *

        - * 2 variant files to resolve. + * Two variant files to resolve. *

        * *

        Output

        @@ -99,11 +100,11 @@ import java.util.*; * A single consensus VCF. *

        * - *

        Examples

        + *

        Usage example

        *
        - * java -Xmx1g -jar GenomeAnalysisTK.jar \
        - *   -R ref.fasta \
        + * java -jar GenomeAnalysisTK.jar \
          *   -T HaplotypeResolver \
        + *   -R reference.fasta \
          *   -V:v1 input1.vcf \
          *   -V:v2 input2.vcf \
          *   -o output.vcf
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LDMerger.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LDMerger.java
        new file mode 100644
        index 000000000..7d17e7502
        --- /dev/null
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LDMerger.java
        @@ -0,0 +1,314 @@
        +/*
        +* By downloading the PROGRAM you agree to the following terms of use:
        +* 
        +* BROAD INSTITUTE
        +* SOFTWARE LICENSE AGREEMENT
        +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
        +* 
        +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
        +* 
        +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
        +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
        +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
        +* 
        +* 1. DEFINITIONS
        +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
        +* 
        +* 2. LICENSE
        +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute.  LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
        +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
        +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
        +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
        +* 
        +* 3. PHONE-HOME FEATURE
        +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM.  Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time.  Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
        +* 
        +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
        +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
        +* Copyright 2012-2014 Broad Institute, Inc.
        +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
        +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
        +* 
        +* 5. INDEMNIFICATION
        +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
        +* 
        +* 6. NO REPRESENTATIONS OR WARRANTIES
        +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
        +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
        +* 
        +* 7. ASSIGNMENT
        +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
        +* 
        +* 8. MISCELLANEOUS
        +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
        +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
        +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
        +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
        +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
        +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
        +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
        +*/
        +
        +package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
        +
        +import org.apache.commons.lang.ArrayUtils;
        +import org.apache.log4j.Logger;
        +import org.broadinstitute.gatk.utils.GenomeLoc;
        +import htsjdk.variant.variantcontext.Allele;
        +import htsjdk.variant.variantcontext.VariantContext;
        +import htsjdk.variant.variantcontext.VariantContextBuilder;
        +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
        +import org.broadinstitute.gatk.utils.haplotype.Haplotype;
        +
        +import java.util.Arrays;
        +import java.util.Iterator;
        +import java.util.List;
        +import java.util.TreeSet;
        +
        +/**
        + * Merges VariantContexts in a series of haplotypes according to their pairwise LD
        + *
        + * User: depristo
        + * Date: 3/28/13
        + * Time: 6:17 PM
        + */
        +public class LDMerger extends MergeVariantsAcrossHaplotypes {
        +    private final static Logger logger = Logger.getLogger(LDMerger.class);
        +
        +    private final boolean DEBUG;
        +    private final int minSamplesToMergeSNPs;
        +    private final int minSamplesToMergeOtherEvents;
        +
        +    public LDMerger(boolean DEBUG, int minSamplesToMergeSNPs, int minSamplesToMergeOtherEvents) {
        +        super();
        +        this.DEBUG = DEBUG;
        +        this.minSamplesToMergeSNPs = minSamplesToMergeSNPs;
        +        this.minSamplesToMergeOtherEvents = minSamplesToMergeOtherEvents;
        +    }
        +
        +    protected LDMerger() {
        +        this(false, 1, 1);
        +    }
        +
        +    // TODO -- should be class arguments and static variables in HC
        +    protected final static int MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE = 6;
        +    protected final static int MAX_DISTANCE_BETWEEN_OTHER_EVENTS_TO_MERGE = 25;
        +
        +    /**
        +     * We require 99% confidence that only the phased haplotypes exist in the population to merge the records
        +     */
        +    protected final static double MERGE_EVENTS_PROB_PHASED_THRESHOLD = 0.99;
        +
        +    /**
        +     * Merge as many events among the haplotypes as possible based on pairwise LD among variants
        +     *
        +     * @param haplotypes a list of haplotypes whose events we want to merge
        +     * @param readLikelihoods map from sample name -> read likelihoods for each haplotype
        +     * @param startPosKeySet a set of starting positions of all events among the haplotypes
        +     * @param ref the reference bases
        +     * @param refLoc the span of the reference bases
        +     */
        +    @Override
        +    public boolean merge( final List haplotypes,
        +                          final ReadLikelihoods readLikelihoods,
        +                          final TreeSet startPosKeySet,
        +                          final byte[] ref,
        +                          final GenomeLoc refLoc ) {
        +        if ( haplotypes == null ) throw new IllegalArgumentException("haplotypes cannot be null");
        +        if ( readLikelihoods == null ) throw new IllegalArgumentException("readLikelihoods cannot be null");
        +        if ( startPosKeySet == null ) throw new IllegalArgumentException("startPosKeySet cannot be null");
        +        if ( ref == null ) throw new IllegalArgumentException("ref cannot be null");
        +        if ( refLoc == null ) throw new IllegalArgumentException("refLoc cannot be null");
        +        if ( refLoc.size() != ref.length ) throw new IllegalArgumentException("refLoc size " + refLoc.size() + " != ref.length " + ref.length + " at " + refLoc);
        +
        +        if( startPosKeySet.size() <= 1 ) { return false; }
        +
        +        final int nSamples = readLikelihoods.sampleCount();
        +        final HaplotypeLDCalculator r2Calculator = new HaplotypeLDCalculator(haplotypes, readLikelihoods);
        +        boolean somethingWasMerged = false;
        +        boolean mapWasUpdated = true;
        +        while( mapWasUpdated ) {
        +            mapWasUpdated = mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calculator, nSamples, startPosKeySet, ref, refLoc);
        +            somethingWasMerged |= mapWasUpdated;
        +        }
        +        return somethingWasMerged;
        +    }
        +
        +    /**
        +     * Merge the next pair of events, if possible
        +     *
        +     * @param haplotypes a list of haplotypes whose events we want to merge
        +     * @param ldCalculator calculates R^2 for pairs of events on demand
        +     * @param startPosKeySet a set of starting positions of all events among the haplotypes
        +     * @param ref the reference bases
        +     * @param refLoc the span of the reference bases
        +     * @return true if something was merged, false otherwise
        +     */
        +    protected boolean mergeConsecutiveEventsBasedOnLDOnce( final List haplotypes,
        +                                                           final HaplotypeLDCalculator ldCalculator,
        +                                                           final int nSamples,
        +                                                           final TreeSet startPosKeySet,
        +                                                           final byte[] ref,
        +                                                           final GenomeLoc refLoc ) {
        +        // loop over the set of start locations and consider pairs that start near each other
        +        final Iterator iter = startPosKeySet.iterator();
        +        int thisStart = iter.next();
        +        while( iter.hasNext() ) {
        +            final int nextStart = iter.next();
        +            final LDMergeData toMerge = getPairOfEventsToMerge(haplotypes, thisStart, nextStart);
        +
        +            if ( toMerge.canBeMerged(nSamples) ) {
        +                final double pPhased = ldCalculator.computeProbOfBeingPhased(toMerge.firstVC, toMerge.secondVC);
        +
        +                if( DEBUG ) {
        +                    logger.info("Found consecutive biallelic events with R^2 = " + String.format("%.4f", pPhased));
        +                    logger.info("-- " + toMerge.firstVC);
        +                    logger.info("-- " + toMerge.secondVC);
        +                }
        +
        +                if( pPhased > MERGE_EVENTS_PROB_PHASED_THRESHOLD) {
        +                    final VariantContext mergedVC = createMergedVariantContext(toMerge.firstVC, toMerge.secondVC, ref, refLoc);
        +                    // if for some reason the merging resulting in a bad allele, mergedVC will be null, and we will just remove first and second
        +                    replaceVariantContextsInMap(haplotypes, startPosKeySet, mergedVC, toMerge.firstVC, toMerge.secondVC);
        +                    return true; // break out of tree set iteration since it was just updated, start over from the beginning and keep merging events
        +                }
        +            }
        +
        +            thisStart = nextStart;
        +        }
        +
        +        return false;
        +    }
        +
        +    /**
        +     * Info about potential LD merge of two variant contexts
        +     */
        +    private class LDMergeData {
        +        VariantContext firstVC = null, secondVC = null;
        +        boolean canBeMerged = true;
        +
        +        /** Tell this object that it cant be merged for some reason */
        +        public LDMergeData cantBeMerged() {
        +            canBeMerged = false;
        +            return this;
        +        }
        +
        +        /**
        +         * Can these two events be merged
        +         * @param nSamples the number of samples we're considering
        +         * @return true if we can merge our two variant contexts
        +         */
        +        public boolean canBeMerged(final int nSamples) {
        +            if ( ! canBeMerged || firstVC == null || secondVC == null )
        +                return false;
        +
        +            final int distance = secondVC.getStart() - firstVC.getEnd();
        +            if ( firstVC.isSNP() && secondVC.isSNP() ) {
        +                return nSamples >= minSamplesToMergeSNPs && distance <= MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE;
        +            } else {
        +                return nSamples >= minSamplesToMergeOtherEvents && distance <= MAX_DISTANCE_BETWEEN_OTHER_EVENTS_TO_MERGE;
        +            }
        +        }
        +    }
        +
        +    /**
        +     * Get the information about the potential merge of two events starting at thisStart and nextStart
        +     * @param haplotypes our haplotypes
        +     * @param thisStart the starting position of the first event to merge
        +     * @param nextStart the starting position of the next event to merge
        +     * @return never {@code null}.
        +     */
        +    private LDMergeData getPairOfEventsToMerge(final List haplotypes, final int thisStart, final int nextStart) {
        +        final LDMergeData mergeData = new LDMergeData();
        +
        +        for( final Haplotype h : haplotypes ) {
        +            // only make complex substitutions out of consecutive biallelic sites
        +            final VariantContext thisHapVC = h.getEventMap().get(thisStart);
        +            if( thisHapVC != null && !thisHapVC.isSymbolic() ) { // something was found at this location on this haplotype
        +                if( mergeData.firstVC == null ) {
        +                    mergeData.firstVC = thisHapVC;
        +                } else if( !thisHapVC.hasSameAllelesAs( mergeData.firstVC) ) {
        +                    return mergeData.cantBeMerged();
        +                }
        +            }
        +            final VariantContext nextHapVC = h.getEventMap().get(nextStart);
        +            if( nextHapVC != null && !nextHapVC.isSymbolic() ) { // something was found at the next location on this haplotype
        +                if( mergeData.secondVC == null ) {
        +                    mergeData.secondVC = nextHapVC;
        +                } else if( !nextHapVC.hasSameAllelesAs( mergeData.secondVC) ) {
        +                    return mergeData.cantBeMerged();
        +                }
        +            }
        +        }
        +
        +        // don't try to merge overlapping events
        +        if ( mergeData.firstVC != null && mergeData.secondVC != null && mergeData.firstVC.getEnd() >= mergeData.secondVC.getStart() )
        +            return mergeData.cantBeMerged();
        +
        +        return mergeData;
        +    }
        +
        +    // BUGBUG: make this merge function more general
        +    protected VariantContext createMergedVariantContext( final VariantContext thisVC, final VariantContext nextVC, final byte[] ref, final GenomeLoc refLoc ) {
        +        final int thisStart = thisVC.getStart();
        +        final int nextStart = nextVC.getStart();
        +        byte[] refBases = new byte[]{};
        +        byte[] altBases = new byte[]{};
        +        refBases = ArrayUtils.addAll(refBases, thisVC.getReference().getBases());
        +        altBases = ArrayUtils.addAll(altBases, thisVC.getAlternateAllele(0).getBases());
        +        int locus;
        +        for( locus = thisStart + refBases.length; locus < nextStart; locus++ ) {
        +            final byte refByte = ref[locus - refLoc.getStart()];
        +            refBases = ArrayUtils.add(refBases, refByte);
        +            altBases = ArrayUtils.add(altBases, refByte);
        +        }
        +        refBases = ArrayUtils.addAll(refBases, ArrayUtils.subarray(nextVC.getReference().getBases(), locus > nextStart ? 1 : 0, nextVC.getReference().getBases().length)); // special case of deletion including the padding base of consecutive indel
        +        altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases());
        +
        +        int iii = 0;
        +        if( refBases.length == altBases.length ) { // insertion + deletion of same length creates an MNP --> trim common prefix bases off the beginning of the allele
        +            while( iii < refBases.length && refBases[iii] == altBases[iii] ) { iii++; }
        +            if ( iii == refBases.length ) {
        +                // we've become a null allele, such as with CA/C + A/AA -> CA/CA => after trimming there's nothing left
        +                // so return a null variant context so we can eliminate the variants from consideration
        +                return null;
        +            }
        +        }
        +
        +
        +        final Allele refAllele = Allele.create( ArrayUtils.subarray(refBases, iii, refBases.length), true );
        +        final Allele altAllele =  Allele.create( ArrayUtils.subarray(altBases, iii, altBases.length), false );
        +        return new VariantContextBuilder("merged", thisVC.getChr(), thisVC.getStart() + iii, nextVC.getEnd(), Arrays.asList(refAllele, altAllele)).make();
        +    }
        +
        +    /**
        +     * Update the event maps in all haplotypes to replace a replacement of update1 and 2 with replacement
        +     *
        +     * @param haplotypes the haplotypes whose event maps we need to update
        +     * @param startPosKeySet a sorted set of start positions that we must update
        +     * @param replacement a VariantContext to replace update1 and update2 with.  Can be null, indicating that we just want to remove update1 and update2
        +     * @param update1 the first VC we want to update
        +     * @param update2 the second VC we want to update
        +     */
        +    private void replaceVariantContextsInMap(final List haplotypes,
        +                                             final TreeSet startPosKeySet,
        +                                             final VariantContext replacement,
        +                                             final VariantContext update1, final VariantContext update2) {
        +        // remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event
        +        for( final Haplotype h : haplotypes ) {
        +            // if we had both events, add replacement.  In some cases the haplotype may not have both
        +            // events but they were still merged because the haplotype isn't a particularly informative
        +            // haplotype in any case.  The order of operations here is important because we are modifying the map
        +            final boolean shouldAdd = h.getEventMap().containsKey(update1.getStart()) && h.getEventMap().containsKey(update2.getStart());
        +            h.getEventMap().remove(update1.getStart());
        +            h.getEventMap().remove(update2.getStart());
        +            if ( shouldAdd && replacement != null ) {
        +                h.getEventMap().addVC(replacement, false); // cannot merge we other events at the same position
        +            }
        +        }
        +
        +        startPosKeySet.remove(update1.getStart());
        +        startPosKeySet.remove(update2.getStart());
        +        if ( replacement != null ) startPosKeySet.add(replacement.getStart());
        +    }
        +}
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LikelihoodEngineArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LikelihoodEngineArgumentCollection.java
        new file mode 100644
        index 000000000..f3ec62a31
        --- /dev/null
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LikelihoodEngineArgumentCollection.java
        @@ -0,0 +1,119 @@
        +/*
        +* By downloading the PROGRAM you agree to the following terms of use:
        +*
        +* BROAD INSTITUTE
        +* SOFTWARE LICENSE AGREEMENT
        +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
        +*
        +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
        +*
        +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
        +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
        +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
        +*
        +* 1. DEFINITIONS
        +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
        +*
        +* 2. LICENSE
        +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute.  LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
        +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
        +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
        +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
        +*
        +* 3. PHONE-HOME FEATURE
        +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM.  Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time.  Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
        +*
        +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
        +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
        +* Copyright 2012-2014 Broad Institute, Inc.
        +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
        +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
        +*
        +* 5. INDEMNIFICATION
        +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
        +*
        +* 6. NO REPRESENTATIONS OR WARRANTIES
        +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
        +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
        +*
        +* 7. ASSIGNMENT
        +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
        +*
        +* 8. MISCELLANEOUS
        +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
        +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
        +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
        +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
        +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
        +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
        +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
        +*/
        +package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
        +
        +import org.broadinstitute.gatk.utils.commandline.Advanced;
        +import org.broadinstitute.gatk.utils.commandline.Argument;
        +import org.broadinstitute.gatk.utils.commandline.Hidden;
        +import org.broadinstitute.gatk.utils.pairhmm.PairHMM;
        +
        +/**
        + * Set of arguments related to {@link ReadLikelihoodCalculationEngine} implementations
        + *
        + * @author Kristian Cibulskis <kcibul@broadinstitute.org>
        + */
        +public class LikelihoodEngineArgumentCollection {
        +    @Advanced
        +    @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false)
        +    public int gcpHMM = 10;
        +
        +    /**
        +     * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
        +     */
        +    @Hidden
        +    @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
        +    public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.VECTOR_LOGLESS_CACHING;
        +
        +    /**
        +     * This argument is intended for use in the test suite only. It gives developers the ability to select of the
        +     * hardware dependent vectorized implementation of the vectorized PairHMM library (pairHMM=VECTOR_LOGLESS_CACHING).
        +     * For normal usage, you should rely on the architecture auto-detection.
        +     */
        +    @Hidden
        +    @Advanced
        +    @Argument(fullName = "pair_hmm_sub_implementation", shortName = "pairHMMSub", doc = "The PairHMM machine-dependent sub-implementation to use for genotype likelihood calculations", required = false)
        +    public PairHMM.HMM_SUB_IMPLEMENTATION pairHMMSub = PairHMM.HMM_SUB_IMPLEMENTATION.ENABLE_ALL;
        +
        +    /**
        +     * This argument is intended for use in the test suite only. It gives developers the ability to load different
        +     * hardware dependent sub-implementations (-pairHMMSub) of the vectorized PairHMM library (-pairHMM=VECTOR_LOGLESS_CACHING)
        +     * for each test. Without this option, the library is only loaded once (for the first test executed in the suite) even if
        +     * subsequent tests specify a different implementation.
        +     * Each test will output the corresponding library loading messages.
        +     */
        +    @Hidden
        +    @Advanced
        +    @Argument(fullName = "always_load_vector_logless_PairHMM_lib", shortName = "alwaysloadVectorHMM", doc = "Load the vector logless PairHMM library each time a GATK run is initiated in the test suite", required = false)
        +    public boolean alwaysLoadVectorLoglessPairHMMLib = false;
        +
        +    /**
        +     * The phredScaledGlobalReadMismappingRate reflects the average global mismapping rate of all reads, regardless of their
        +     * mapping quality.  This term effects the probability that a read originated from the reference haplotype, regardless of
        +     * its edit distance from the reference, in that the read could have originated from the reference haplotype but
        +     * from another location in the genome.  Suppose a read has many mismatches from the reference, say like 5, but
        +     * has a very high mapping quality of 60.  Without this parameter, the read would contribute 5 * Q30 evidence
        +     * in favor of its 5 mismatch haplotype compared to reference, potentially enough to make a call off that single
        +     * read for all of these events.  With this parameter set to Q30, though, the maximum evidence against any haplotype
        +     * that this (and any) read could contribute is Q30.
        +     *
        +     * Set this term to any negative number to turn off the global mapping rate.
        +     */
        +    @Advanced
        +    @Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false)
        +    public int phredScaledGlobalReadMismappingRate = 45;
        +
        +    @Hidden
        +    @Argument(fullName="noFpga", shortName="noFpga", doc="Disable the use of the FPGA HMM implementation", required = false)
        +    public boolean noFpga = false;
        +
        +
        +
        +}
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/MergeVariantsAcrossHaplotypes.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/MergeVariantsAcrossHaplotypes.java
        new file mode 100644
        index 000000000..7e42a8742
        --- /dev/null
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/MergeVariantsAcrossHaplotypes.java
        @@ -0,0 +1,84 @@
        +/*
        +* By downloading the PROGRAM you agree to the following terms of use:
        +* 
        +* BROAD INSTITUTE
        +* SOFTWARE LICENSE AGREEMENT
        +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
        +* 
        +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
        +* 
        +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
        +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
        +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
        +* 
        +* 1. DEFINITIONS
        +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
        +* 
        +* 2. LICENSE
        +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute.  LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
        +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
        +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
        +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
        +* 
        +* 3. PHONE-HOME FEATURE
        +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM.  Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time.  Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
        +* 
        +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
        +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
        +* Copyright 2012-2014 Broad Institute, Inc.
        +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
        +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
        +* 
        +* 5. INDEMNIFICATION
        +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
        +* 
        +* 6. NO REPRESENTATIONS OR WARRANTIES
        +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
        +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
        +* 
        +* 7. ASSIGNMENT
        +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
        +* 
        +* 8. MISCELLANEOUS
        +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
        +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
        +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
        +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
        +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
        +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
        +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
        +*/
        +
        +package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
        +
        +import org.broadinstitute.gatk.utils.GenomeLoc;
        +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
        +import org.broadinstitute.gatk.utils.haplotype.Haplotype;
        +
        +import java.util.List;
        +import java.util.TreeSet;
        +
        +/**
        + * Baseclass for code that wants to merge variants together in the haplotype caller
        + *
        + * This root class is basically a no-op, and can be used to not do any merging
        + */
        +public class MergeVariantsAcrossHaplotypes {
        +    /**
        +     * Merge variants across the haplotypes, updating the haplotype event maps and startPos set as appropriate
        +     *
        +     * @param haplotypes a list of haplotypes whose events we want to merge
        +     * @param readLikelihoods map from sample name -> read likelihoods for each haplotype
        +     * @param startPosKeySet a set of starting positions of all events among the haplotypes
        +     * @param ref the reference bases
        +     * @param refLoc the span of the reference bases
        +     * @return true if anything was merged
        +     */
        +    public boolean merge( final List haplotypes,
        +                          final ReadLikelihoods readLikelihoods,
        +                          final TreeSet startPosKeySet,
        +                          final byte[] ref,
        +                          final GenomeLoc refLoc ) {
        +        return false;
        +    }
        +}
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java
        index 3c8b0c40d..0f04e5139 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java
        @@ -56,17 +56,17 @@ import com.google.java.contract.Requires;
         import htsjdk.samtools.SAMUtils;
         import htsjdk.variant.variantcontext.Allele;
         import org.apache.log4j.Logger;
        -import org.broadinstitute.gatk.tools.walkers.genotyper.AlleleList;
        -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedAlleleList;
        -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList;
        +import org.broadinstitute.gatk.utils.genotyper.AlleleList;
        +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList;
        +import org.broadinstitute.gatk.utils.genotyper.SampleList;
         import org.broadinstitute.gatk.utils.MathUtils;
         import org.broadinstitute.gatk.utils.QualityUtils;
         import org.broadinstitute.gatk.utils.exceptions.UserException;
         import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
         import org.broadinstitute.gatk.utils.haplotype.Haplotype;
         import org.broadinstitute.gatk.utils.pairhmm.*;
        -import org.broadinstitute.gatk.utils.recalibration.covariates.RepeatCovariate;
        -import org.broadinstitute.gatk.utils.recalibration.covariates.RepeatLengthCovariate;
        +import org.broadinstitute.gatk.engine.recalibration.covariates.RepeatCovariate;
        +import org.broadinstitute.gatk.engine.recalibration.covariates.RepeatLengthCovariate;
         import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
         
         import java.io.File;
        @@ -78,13 +78,13 @@ import java.util.*;
         public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalculationEngine {
             private final static Logger logger = Logger.getLogger(PairHMMLikelihoodCalculationEngine.class);
         
        -    public static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual
        -
             private final byte constantGCP;
         
             private final double log10globalReadMismappingRate;
         
             private final PairHMM.HMM_IMPLEMENTATION hmmType;
        +    private final PairHMM.HMM_SUB_IMPLEMENTATION hmmSubType;
        +    private final boolean alwaysLoadVectorLoglessPairHMMLib;
             private final boolean noFpga;
         
             private final ThreadLocal pairHMMThreadLocal = new ThreadLocal() {
        @@ -101,15 +101,15 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula
                         case VECTOR_LOGLESS_CACHING:
                             try
                             {
        -                        return new VectorLoglessPairHMM();
        +                        return new VectorLoglessPairHMM(hmmSubType, alwaysLoadVectorLoglessPairHMMLib);
                             }
                             catch(UnsatisfiedLinkError ule)
                             {
        -                        logger.debug("Failed to load native library for VectorLoglessPairHMM - using Java implementation of LOGLESS_CACHING");
        +                        logger.warn("Failed to load native library for VectorLoglessPairHMM - using Java implementation of LOGLESS_CACHING");
                                 return new LoglessPairHMM();
                             }
                         case DEBUG_VECTOR_LOGLESS_CACHING:
        -                    return new DebugJNILoglessPairHMM(PairHMM.HMM_IMPLEMENTATION.VECTOR_LOGLESS_CACHING);
        +                    return new DebugJNILoglessPairHMM(PairHMM.HMM_IMPLEMENTATION.VECTOR_LOGLESS_CACHING, hmmSubType, alwaysLoadVectorLoglessPairHMMLib);
                         case ARRAY_LOGLESS:
                             if (noFpga || !CnyPairHMM.isAvailable())
                                 return new ArrayLoglessPairHMM();
        @@ -129,11 +129,22 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula
         
             public enum PCR_ERROR_MODEL {
                 /** no specialized PCR error model will be applied; if base insertion/deletion qualities are present they will be used */
        -        NONE,
        +        NONE(null),
        +        /** a most aggressive model will be applied that sacrifices true positives in order to remove more false positives */
        +        HOSTILE(1.0),
                 /** a more aggressive model will be applied that sacrifices true positives in order to remove more false positives */
        -        AGGRESSIVE,
        +        AGGRESSIVE(2.0),
                 /** a less aggressive model will be applied that tries to maintain a high true positive rate at the expense of allowing more false positives */
        -        CONSERVATIVE
        +        CONSERVATIVE(3.0);
        +
        +        private final Double rateFactor;
        +
        +        /** rate factor is applied to the PCR error model.  Can be null to imply no correction */
        +        PCR_ERROR_MODEL(Double rateFactor) {
        +            this.rateFactor = rateFactor;
        +        }
        +        private Double getRateFactor() { return rateFactor; }
        +        private boolean hasRateFactor() { return rateFactor != null; }
             }
         
             private final PCR_ERROR_MODEL pcrErrorModel;
        @@ -150,6 +161,8 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula
              *
              * @param constantGCP the gap continuation penalty to use with the PairHMM
              * @param hmmType the type of the HMM to use
        +     * @param hmmSubType the type of the machine dependent sub-implementation of HMM to use
        +     * @param alwaysLoadVectorLoglessPairHMMLib always load the vector logless HMM library instead of once
              * @param log10globalReadMismappingRate the global mismapping probability, in log10(prob) units.  A value of
              *                                      -3 means that the chance that a read doesn't actually belong at this
              *                                      location in the genome is 1 in 1000.  The effect of this parameter is
        @@ -159,9 +172,13 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula
              *                                      reference haplotype gets a score of -100 from the pairhmm it will be
              *                                      assigned a likelihood of -13.
              * @param noFpga disable FPGA acceleration
        +     * @param pcrErrorModel model to correct for PCR indel artifacts
              */
        -    public PairHMMLikelihoodCalculationEngine( final byte constantGCP, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate, final boolean noFpga, final PCR_ERROR_MODEL pcrErrorModel ) {
        +    public PairHMMLikelihoodCalculationEngine( final byte constantGCP, final PairHMM.HMM_IMPLEMENTATION hmmType, final PairHMM.HMM_SUB_IMPLEMENTATION hmmSubType,
        +                                               final boolean alwaysLoadVectorLoglessPairHMMLib, final double log10globalReadMismappingRate, final boolean noFpga, final PCR_ERROR_MODEL pcrErrorModel ) {
                 this.hmmType = hmmType;
        +        this.hmmSubType = hmmSubType;
        +        this.alwaysLoadVectorLoglessPairHMMLib = alwaysLoadVectorLoglessPairHMMLib;
                 this.constantGCP = constantGCP;
                 this.log10globalReadMismappingRate = log10globalReadMismappingRate;
                 this.noFpga = noFpga;
        @@ -189,7 +206,7 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula
             private void capMinimumReadQualities(GATKSAMRecord read, byte[] readQuals, byte[] readInsQuals, byte[] readDelQuals) {
                 for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
                     readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
        -            readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
        +            readQuals[kkk] = ( readQuals[kkk] < PairHMM.BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
                     readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] );
                     readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] );
                 }
        @@ -415,14 +432,14 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula
             private final RepeatCovariate repeatCovariate = new RepeatLengthCovariate();
         
             private void initializePCRErrorModel() {
        -        if ( pcrErrorModel == PCR_ERROR_MODEL.NONE )
        +        if ( pcrErrorModel == PCR_ERROR_MODEL.NONE || !pcrErrorModel.hasRateFactor() )
                     return;
         
                 repeatCovariate.initialize(MAX_STR_UNIT_LENGTH, MAX_REPEAT_LENGTH);
         
                 pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH + 1];
         
        -        final double rateFactor = pcrErrorModel == PCR_ERROR_MODEL.AGGRESSIVE ? 2.0 : 3.0;
        +        final double rateFactor = pcrErrorModel.getRateFactor();
         
                 for( int iii = 0; iii <= MAX_REPEAT_LENGTH; iii++ )
                     pcrIndelErrorModelCache[iii] = getErrorModelAdjustedQual(iii, rateFactor);
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java
        index 1f75402b0..7a134a380 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java
        @@ -52,10 +52,10 @@
         package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
         
         import htsjdk.variant.variantcontext.Allele;
        -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
        -import org.broadinstitute.gatk.tools.walkers.genotyper.AlleleList;
        -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedAlleleList;
        -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList;
        +import org.broadinstitute.gatk.utils.genotyper.AlleleList;
        +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList;
        +import org.broadinstitute.gatk.utils.genotyper.SampleList;
        +import org.broadinstitute.gatk.utils.Utils;
         import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
         import org.broadinstitute.gatk.utils.haplotype.Haplotype;
         import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
        @@ -77,7 +77,7 @@ public class RandomLikelihoodCalculationEngine implements ReadLikelihoodCalculat
                 final AlleleList haplotypes = new IndexedAlleleList<>(assemblyResultSet.getHaplotypeList());
                 final ReadLikelihoods result = new ReadLikelihoods(samples, haplotypes, reads);
                 final Map alleles = new HashMap<>(haplotypes.alleleCount());
        -        final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
        +        final Random rnd = Utils.getRandomGenerator();
                 final int sampleCount = samples.sampleCount();
                 final int alleleCount = haplotypes.alleleCount();
                 for (int i = 0; i < sampleCount; i++)  {
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java
        index 14fc080b5..6dbcd161e 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java
        @@ -51,7 +51,7 @@
         
         package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
         
        -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList;
        +import org.broadinstitute.gatk.utils.genotyper.SampleList;
         import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
         import org.broadinstitute.gatk.utils.haplotype.Haplotype;
         import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingAssemblerArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingAssemblerArgumentCollection.java
        new file mode 100644
        index 000000000..14b6b4361
        --- /dev/null
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingAssemblerArgumentCollection.java
        @@ -0,0 +1,204 @@
        +/*
        +* By downloading the PROGRAM you agree to the following terms of use:
        +*
        +* BROAD INSTITUTE
        +* SOFTWARE LICENSE AGREEMENT
        +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
        +*
        +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
        +*
        +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
        +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
        +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
        +*
        +* 1. DEFINITIONS
        +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
        +*
        +* 2. LICENSE
        +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute.  LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
        +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
        +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
        +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
        +*
        +* 3. PHONE-HOME FEATURE
        +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM.  Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time.  Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
        +*
        +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
        +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
        +* Copyright 2012-2014 Broad Institute, Inc.
        +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
        +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
        +*
        +* 5. INDEMNIFICATION
        +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
        +*
        +* 6. NO REPRESENTATIONS OR WARRANTIES
        +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
        +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
        +*
        +* 7. ASSIGNMENT
        +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
        +*
        +* 8. MISCELLANEOUS
        +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
        +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
        +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
        +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
        +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
        +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
        +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
        +*/
        +package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
        +
        +import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
        +import org.broadinstitute.gatk.utils.commandline.Advanced;
        +import org.broadinstitute.gatk.utils.commandline.Argument;
        +import org.broadinstitute.gatk.utils.commandline.Hidden;
        +import org.broadinstitute.gatk.utils.commandline.Output;
        +
        +import java.io.PrintStream;
        +import java.util.Arrays;
        +import java.util.List;
        +
        +/**
        + * Set of arguments related to the {@link ReadThreadingAssembler}
        + *
        + * @author Kristian Cibulskis <kcibul@broadinstitute.org>
        + */
        +public class ReadThreadingAssemblerArgumentCollection {
        +
        +    // -----------------------------------------------------------------------------------------------
        +    // arguments to control internal behavior of the read threading assembler
        +    // -----------------------------------------------------------------------------------------------
        +
        +    /**
        +     * Multiple kmer sizes can be specified, using e.g. `-kmerSize 10 -kmerSize 25`.
        +     */
        +    @Advanced
        +    @Argument(fullName="kmerSize", shortName="kmerSize", doc="Kmer size to use in the read threading assembler", required = false)
        +    public List kmerSizes = Arrays.asList(10, 25);
        +
        +    /**
        +     * When graph cycles are detected, the normal behavior is to increase kmer sizes iteratively until the cycles are
        +     * resolved. Disabling this behavior may cause the program to give up on assembling the ActiveRegion.
        +     */
        +    @Advanced
        +    @Argument(fullName="dontIncreaseKmerSizesForCycles", shortName="dontIncreaseKmerSizesForCycles", doc="Disable iterating over kmer sizes when graph cycles are detected", required = false)
        +    public boolean dontIncreaseKmerSizesForCycles = false;
        +
        +    /**
        +     * By default, the program does not allow processing of reference sections that contain non-unique kmers. Disabling
        +     * this check may cause problems in the assembly graph.
        +     */
        +    @Advanced
        +    @Argument(fullName="allowNonUniqueKmersInRef", shortName="allowNonUniqueKmersInRef", doc="Allow graphs that have non-unique kmers in the reference", required = false)
        +    public boolean allowNonUniqueKmersInRef = false;
        +
        +    /**
        +     * If fewer samples than the specified number pass the minPruning threshold for a given path, that path will be eliminated from the graph.
        +     */
        +    @Advanced
        +    @Argument(fullName="numPruningSamples", shortName="numPruningSamples", doc="Number of samples that must pass the minPruning threshold", required = false)
        +    public int numPruningSamples = 1;
        +
        +    /**
        +     * As of version 3.3, this argument is no longer needed because dangling end recovery is now the default behavior. See GATK 3.3 release notes for more details.
        +     */
        +    @Deprecated
        +    @Argument(fullName="recoverDanglingHeads", shortName="recoverDanglingHeads", doc="This argument is deprecated since version 3.3", required = false)
        +    public boolean DEPRECATED_RecoverDanglingHeads = false;
        +
        +    /**
        +     * By default, the read threading assembler will attempt to recover dangling heads and tails. See the `minDanglingBranchLength` argument documentation for more details.
        +     */
        +    @Hidden
        +    @Argument(fullName="doNotRecoverDanglingBranches", shortName="doNotRecoverDanglingBranches", doc="Disable dangling head and tail recovery", required = false)
        +    public boolean doNotRecoverDanglingBranches = false;
        +
        +    /**
        +     * When constructing the assembly graph we are often left with "dangling" branches.  The assembly engine attempts to rescue these branches
        +     * by merging them back into the main graph.  This argument describes the minimum length of a dangling branch needed for the engine to
        +     * try to rescue it.  A smaller number here will lead to higher sensitivity to real variation but also to a higher number of false positives.
        +     */
        +    @Advanced
        +    @Argument(fullName="minDanglingBranchLength", shortName="minDanglingBranchLength", doc="Minimum length of a dangling branch to attempt recovery", required = false)
        +    public int minDanglingBranchLength = 4;
        +
        +    /**
        +     * This argument is specifically intended for 1000G consensus analysis mode. Setting this flag will inject all
        +     * provided alleles to the assembly graph but will not forcibly genotype all of them.
        +     */
        +    @Advanced
        +    @Argument(fullName="consensus", shortName="consensus", doc="1000G consensus mode", required = false)
        +    public boolean consensusMode = false;
        +
        +    /**
        +     * The assembly graph can be quite complex, and could imply a very large number of possible haplotypes.  Each haplotype
        +     * considered requires N PairHMM evaluations if there are N reads across all samples.  In order to control the
        +     * run of the haplotype caller we only take maxNumHaplotypesInPopulation paths from the graph, in order of their
        +     * weights, no matter how many paths are possible to generate from the graph.  Putting this number too low
        +     * will result in dropping true variation because paths that include the real variant are not even considered.
        +     * You can consider increasing this number when calling organisms with high heterozygosity.
        +     */
        +    @Advanced
        +    @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population", required = false)
        +    public int maxNumHaplotypesInPopulation = 128;
        +
        +    /**
        +     * Enabling this argument may cause fundamental problems with the assembly graph itself.
        +     */
        +    @Hidden
        +    @Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly", required=false)
        +    public boolean errorCorrectKmers = false;
        +
        +    /**
        +     * Paths with fewer supporting kmers than the specified threshold will be pruned from the graph.
        +     *
        +     * Be aware that this argument can dramatically affect the results of variant calling and should only be used with great caution.
        +     * Using a prune factor of 1 (or below) will prevent any pruning from the graph, which is generally not ideal; it can make the
        +     * calling much slower and even less accurate (because it can prevent effective merging of "tails" in the graph).  Higher values
        +     * tend to make the calling much faster, but also lowers the sensitivity of the results (because it ultimately requires higher
        +     * depth to produce calls).
        +     */
        +    @Advanced
        +    @Argument(fullName="minPruning", shortName="minPruning", doc = "Minimum support to not prune paths in the graph", required = false)
        +    public int MIN_PRUNE_FACTOR = 2;
        +
        +    @Hidden
        +    @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="Write DOT formatted graph files out of the assembler for only this graph size", required = false)
        +    public boolean debugGraphTransformations = false;
        +
        +    @Hidden
        +    @Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="Allow cycles in the kmer graphs to generate paths with multiple copies of the path sequenece rather than just the shortest paths", required = false)
        +    public boolean allowCyclesInKmerGraphToGeneratePaths = false;
        +
        +    /**
        +     * This argument is meant for debugging and is not immediately useful for normal analysis use.
        +     */
        +    @Output(fullName="graphOutput", shortName="graph", doc="Write debug assembly graph information to this file", required = false, defaultToStdout = false)
        +    public PrintStream graphWriter = null;
        +
        +    //---------------------------------------------------------------------------------------------------------------
        +    //
        +    // Read Error Corrector Related Parameters
        +    //
        +    // ---------------------------------------------------------------------------------------------------------------
        +
        +    /**
        +     * Enabling this argument may cause fundamental problems with the assembly graph itself.
        +     */
        +    @Hidden
        +    @Argument(fullName="kmerLengthForReadErrorCorrection", shortName="kmerLengthForReadErrorCorrection", doc = "Use an exploratory algorithm to error correct the kmers used during assembly", required=false)
        +    public int kmerLengthForReadErrorCorrection = 25;
        +
        +    @Hidden
        +    @Argument(fullName="minObservationsForKmerToBeSolid", shortName="minObservationsForKmerToBeSolid", doc = "A k-mer must be seen at least these times for it considered to be solid", required=false)
        +    public int minObservationsForKmerToBeSolid = 20;
        +
        +
        +
        +
        +
        +
        +
        +}
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java
        index 0599ee880..d674800ad 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java
        @@ -55,14 +55,14 @@ import htsjdk.samtools.*;
         import htsjdk.variant.variantcontext.*;
         import htsjdk.variant.vcf.VCFHeaderLine;
         import htsjdk.variant.vcf.VCFSimpleHeaderLine;
        -import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
        +import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
         import org.broadinstitute.gatk.tools.walkers.genotyper.*;
         import org.broadinstitute.gatk.utils.GenomeLoc;
         import org.broadinstitute.gatk.utils.GenomeLocParser;
         import org.broadinstitute.gatk.utils.MathUtils;
         import org.broadinstitute.gatk.utils.QualityUtils;
         import org.broadinstitute.gatk.utils.activeregion.ActiveRegion;
        -import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
        +import org.broadinstitute.gatk.utils.genotyper.*;
         import org.broadinstitute.gatk.utils.haplotype.Haplotype;
         import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState;
         import org.broadinstitute.gatk.utils.pileup.PileupElement;
        @@ -70,6 +70,7 @@ import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
         import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl;
         import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
         import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
        +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
         import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
         
         import java.io.File;
        @@ -87,9 +88,6 @@ import java.util.*;
          */
         public class ReferenceConfidenceModel {
         
        -    //public final static String INDEL_INFORMATIVE_DEPTH = "CD"; // temporarily taking this extra genotype level information out for now
        -    public final static String ALTERNATE_ALLELE_STRING = "ALT"; // arbitrary alternate allele
        -
             private final GenomeLocParser genomeLocParser;
         
             private final SampleList samples;
        @@ -137,9 +135,7 @@ public class ReferenceConfidenceModel {
              */
             public Set getVCFHeaderLines() {
                 final Set headerLines = new LinkedHashSet<>();
        -        // TODO - do we need a new kind of VCF Header subclass for specifying arbitrary alternate alleles?
        -        headerLines.add(new VCFSimpleHeaderLine(ALTERNATE_ALLELE_STRING, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location"));
        -        //headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize));
        +        headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location"));
                 return headerLines;
             }
         
        @@ -215,7 +211,7 @@ public class ReferenceConfidenceModel {
                         homRefCalc.capByHomRefLikelihood();
         
                         final Allele refAllele = Allele.create(refBase, true);
        -                final List refSiteAlleles = Arrays.asList(refAllele, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
        +                final List refSiteAlleles = Arrays.asList(refAllele, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
                         final VariantContextBuilder vcb = new VariantContextBuilder("HC", curPos.getContig(), curPos.getStart(), curPos.getStart(), refSiteAlleles);
                         final GenotypeBuilder gb = new GenotypeBuilder(sampleName, GATKVariantContextUtils.homozygousAlleleList(refAllele, ploidy));
                         gb.AD(homRefCalc.AD_Ref_Any);
        @@ -265,26 +261,23 @@ public class ReferenceConfidenceModel {
              * @return non-null GenotypeLikelihoods given N
              */
             protected final GenotypeLikelihoods getIndelPLs(final int ploidy, final int nInformativeReads) {
        -        if (ploidy > MAX_N_INDEL_PLOIDY)
        -            throw new IllegalArgumentException("you have hit a current limitation of the GVCF output model that cannot handle ploidies larger than " + MAX_N_INDEL_PLOIDY + " , please let the GATK team about it: " + ploidy);
                 return indelPLCache(ploidy, nInformativeReads > MAX_N_INDEL_INFORMATIVE_READS ? MAX_N_INDEL_INFORMATIVE_READS : nInformativeReads);
             }
         
             protected static final int MAX_N_INDEL_INFORMATIVE_READS = 40; // more than this is overkill because GQs are capped at 99 anyway
        -    private static final int MAX_N_INDEL_PLOIDY = 20;
        -    private static final GenotypeLikelihoods[][] indelPLCache = new GenotypeLikelihoods[MAX_N_INDEL_PLOIDY][];
        +    private static final int INITIAL_INDEL_LK_CACHE_PLOIDY_CAPACITY = 20;
        +    private static GenotypeLikelihoods[][] indelPLCache = new GenotypeLikelihoods[INITIAL_INDEL_LK_CACHE_PLOIDY_CAPACITY + 1][];
             private static final double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp
         
             private final GenotypeLikelihoods indelPLCache(final int ploidy, final int nInformativeReads) {
        -        GenotypeLikelihoods[] indelPLCacheByPloidy = indelPLCache[ploidy];
        -        if (indelPLCacheByPloidy == null)
        -            return initializeIndelPLCache(ploidy)[nInformativeReads];
        -        else
        -            return indelPLCacheByPloidy[nInformativeReads];
        +        return initializeIndelPLCache(ploidy)[nInformativeReads];
             }
         
             private synchronized GenotypeLikelihoods[] initializeIndelPLCache(final int ploidy) {
        -        // Double-check whether another thread has done the initialization.
        +
        +        if (indelPLCache.length <= ploidy)
        +            indelPLCache = Arrays.copyOf(indelPLCache,ploidy << 1);
        +
                 if (indelPLCache[ploidy] != null)
                     return indelPLCache[ploidy];
         
        @@ -308,42 +301,6 @@ public class ReferenceConfidenceModel {
                 return result;
             }
         
        -    /**
        -     * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt
        -     *
        -     * @param pileup the read backed pileup containing the data we want to evaluate
        -     * @param refBase the reference base at this pileup position
        -     * @param minBaseQual the min base quality for a read in the pileup at the pileup position to be included in the calculation
        -     * @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips
        -     * @return a RefVsAnyResult genotype call
        -     */
        -    @Deprecated
        -    public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final MathUtils.RunningAverage hqSoftClips) {
        -        final RefVsAnyResult result = new RefVsAnyResult();
        -
        -        for( final PileupElement p : pileup ) {
        -            final byte qual = (p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual());
        -            if( p.isDeletion() || qual > minBaseQual ) {
        -                int AA = 0; final int AB = 1; int BB = 2;
        -                if( p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
        -                    AA = 2;
        -                    BB = 0;
        -                    if( hqSoftClips != null && p.isNextToSoftClip() ) {
        -                        hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28));
        -                    }
        -                    result.AD_Ref_Any[1]++;
        -                } else {
        -                    result.AD_Ref_Any[0]++;
        -                }
        -                result.genotypeLikelihoods[AA] += QualityUtils.qualToProbLog10(qual);
        -                result.genotypeLikelihoods[AB] += MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF );
        -                result.genotypeLikelihoods[BB] += QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD;
        -            }
        -        }
        -
        -        return result;
        -    }
        -
             /**
              * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt
              *
        @@ -359,7 +316,7 @@ public class ReferenceConfidenceModel {
             public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final String sampleName, final int ploidy,
                                                                 final GenotypingModel genotypingModel,
                                                                 final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final MathUtils.RunningAverage hqSoftClips) {
        -        final AlleleList alleleList = new IndexedAlleleList<>(Allele.create(refBase,true),GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
        +        final AlleleList alleleList = new IndexedAlleleList<>(Allele.create(refBase,true), GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
                 // Notice that the sample name is rather irrelevant as this information is never used, just need to be the same in both lines bellow.
         
                 final int maximumReadCount = pileup.getReads().size();
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinderNode.java
        similarity index 100%
        rename from protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java
        rename to protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinderNode.java
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java
        index fa54a4fda..f9de3173a 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java
        @@ -124,7 +124,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
                     while ( results.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) {
                         // on the last attempt we will allow low complexity graphs
                         final boolean lastAttempt = numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT;
        -                addResult(results, createGraph(reads, refHaplotype, kmerSize, givenHaplotypes, lastAttempt, lastAttempt));
        +                addResult(results, createGraph(reads, refHaplotype, kmerSize, givenHaplotypes, lastAttempt, allowNonUniqueKmersInRef || lastAttempt));
                         kmerSize += KMER_SIZE_ITERATION_INCREASE;
                         numIterations++;
                     }
        diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java
        index 0652a767c..75df30550 100644
        --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java
        +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java
        @@ -58,11 +58,10 @@ import htsjdk.samtools.util.StringUtil;
         import htsjdk.tribble.Feature;
         import org.broadinstitute.gatk.utils.commandline.*;
         import org.broadinstitute.gatk.engine.CommandLineGATK;
        -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
        -import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
        -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter;
        +import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
        +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter;
         import org.broadinstitute.gatk.engine.iterators.ReadTransformer;
        -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
        +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
         import org.broadinstitute.gatk.engine.walkers.BAQMode;
         import org.broadinstitute.gatk.engine.walkers.ReadWalker;
         import org.broadinstitute.gatk.utils.BaseUtils;
        @@ -80,9 +79,8 @@ import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
         import org.broadinstitute.gatk.utils.help.HelpConstants;
         import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
         import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
        -import org.broadinstitute.gatk.utils.sam.NWaySAMFileWriter;
        +import org.broadinstitute.gatk.engine.io.NWaySAMFileWriter;
         import org.broadinstitute.gatk.utils.sam.ReadUtils;
        -import org.broadinstitute.gatk.utils.text.TextFormattingUtils;
         import org.broadinstitute.gatk.utils.text.XReadLines;
         import htsjdk.variant.variantcontext.VariantContext;
         
        @@ -93,10 +91,10 @@ import java.io.IOException;
         import java.util.*;
         
         /**
        - * Performs local realignment of reads to correct misalignments due to the presence of indels.
        + * Perform local realignment of reads around indels
          *
          * 

        - * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases + * The local realignment process is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, @@ -107,12 +105,13 @@ import java.util.*; * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and * specifically identify indels. *

        - *
          There are 2 steps to the realignment process: + *

          There are 2 steps to the realignment process:

          + *
            *
          1. Determining (small) suspicious intervals which are likely in need of realignment (see the RealignerTargetCreator tool)
          2. *
          3. Running the realigner over those intervals (IndelRealigner)
          4. *
          *

          - * For more details, see http://www.broadinstitute.org/gatk/guide/article?id=38 + * For more details, see the indel realignment method documentation. *

          * *

          Input

          @@ -125,26 +124,24 @@ import java.util.*; * A realigned version of your input BAM file(s). *

          * - *

          Example

          + *

          Usage example

          *
          - * java -Xmx4g -jar GenomeAnalysisTK.jar \
          + * java -jar GenomeAnalysisTK.jar \
            *   -T IndelRealigner \
          - *   -R ref.fasta \
          + *   -R reference.fasta \
            *   -I input.bam \
          + *   --known indels.vcf \
            *   -targetIntervals intervalListFromRTC.intervals \
          - *   -o realignedBam.bam \
          - *   [-known /path/to/indels.vcf] \
          - *   [-compress 0]    (this argument recommended to speed up the process *if* this is only a temporary file; otherwise, use the default value)
          + *   -o realignedBam.bam
            * 
          * *

          Caveats

          - * - *
          • - * An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step. - *
          • - * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them - * (or with reads from similar technologies). - *
          + *
            + *
          • The input BAM(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step.
          • + *
          • Because reads produced from the 454 technology inherently contain false indels, the realigner will not work with them + * (or with reads from similar technologies).
          • + *
          • This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string.
          • + *
          * * @author ebanks */ @@ -403,7 +400,7 @@ public class IndelRealigner extends ReadWalker { throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile,ex); } - intervals = intervalsFile.getIntervals(getToolkit()).iterator(); + intervals = intervalsFile.getIntervals(getToolkit().getGenomeLocParser()).iterator(); currentInterval = intervals.hasNext() ? intervals.next() : null; @@ -477,10 +474,8 @@ public class IndelRealigner extends ReadWalker { if ( NO_PG_TAG ) return null; final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); - final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); try { - final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version"); - programRecord.setProgramVersion(version); + programRecord.setProgramVersion(CommandLineProgram.getVersionNumber()); } catch (MissingResourceException e) { // this is left empty on purpose (perhaps Andrey knows why?) } @@ -991,7 +986,7 @@ public class IndelRealigner extends ReadWalker { else { int readsSeen = 0; while ( readsSeen++ < MAX_READS_FOR_CONSENSUSES && altConsensesToPopulate.size() <= MAX_CONSENSUSES) { - int index = GenomeAnalysisEngine.getRandomGenerator().nextInt(altAlignmentsToTest.size()); + int index = Utils.getRandomGenerator().nextInt(altAlignmentsToTest.size()); AlignedRead aRead = altAlignmentsToTest.remove(index); if ( CHECKEARLY ) createAndAddAlternateConsensus1(aRead, altConsensesToPopulate, reference,leftmostIndex); else createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/LeftAlignIndels.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/LeftAlignIndels.java index ba1ad9db2..a6afa812a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/LeftAlignIndels.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/LeftAlignIndels.java @@ -55,9 +55,9 @@ import htsjdk.samtools.Cigar; import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.ReadWalker; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; @@ -66,30 +66,35 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; /** - * Left-aligns indels from reads in a bam file. + * Left-align indels within reads in a bam file * - *

          - * LeftAlignIndels is a tool that takes a bam file and left-aligns any indels inside it. The same indel can often be - * placed at multiple positions and still represent the same haplotype. While a standard convention is to place an - * indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. + *

          This tool left-aligns any indels within read cigars in order to standardize representation when there are multiple valid + * representations possible (i.e. where the same indel can be placed at multiple positions and still represent the same haplotype). + * The standard convention is to place an indel at the left-most position possible, but this is not always followed, so + * this tool can be used to correct the representation of indels.

          + * + *

          Note

          + *

          This is only really needed when calling variants with legacy locus-based tools such as UnifiedGenotyper. With more + * sophisticated tools (like HaplotypeCaller) that involve reconstructing haplotypes (eg through reassembly), the problem + * of multiple valid representations is handled internally and does not need to be corrected explicitly.

          * *

          Input

          *

          - * A bam file to left-align. + * A bam file with mapped reads. *

          * *

          Output

          *

          - * A left-aligned bam. + * A bam file in which indels have been left-aligned where appropriate. *

          * - *

          Examples

          + *

          Usage example

          *
          - * java -Xmx3g -jar GenomeAnalysisTK.jar \
          - *   -R ref.fasta \
          + * java -jar GenomeAnalysisTK.jar \
          + *   -R reference.fasta \
            *   -T LeftAlignIndels \
          - *   -I input.bam \
          - *   -o output.vcf
          + *   -I reads.bam \
          + *   -o output_with_leftaligned_indels.bam
            * 
          * */ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java index 27abc48ad..ba9c985db 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java @@ -53,10 +53,10 @@ package org.broadinstitute.gatk.tools.walkers.indels; import com.google.java.contract.Ensures; import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.tools.walkers.genotyper.AlleleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedAlleleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.clipping.ReadClipper; import org.broadinstitute.gatk.utils.exceptions.UserException; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/RealignerTargetCreator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/RealignerTargetCreator.java index 69ef455d6..a81af2e5a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/RealignerTargetCreator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/RealignerTargetCreator.java @@ -57,11 +57,11 @@ import org.broadinstitute.gatk.utils.commandline.Input; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.engine.filters.*; import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; @@ -77,57 +77,62 @@ import java.util.List; import java.util.TreeSet; /** - * Emits intervals for the Local Indel Realigner to target for realignment. + * Define intervals to target for local realignment * *

          - * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases + * The local realignment process is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, * it is impossible to place reads on the reference genome such that mismatches are minimized across all reads. Consequently, even when some reads are * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus - * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an - * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and - * specifically identify indels. - *

          - *

            There are 2 steps to the realignment process: + * indel suitable for standard variant discovery approaches. Unlike most mappers, this tool uses the full alignment context to determine whether an + * appropriate alternate reference (i.e. indel) exists. + *

            + *

            There are 2 steps to the realignment process:

            + *
              *
            1. Determining (small) suspicious intervals which are likely in need of realignment (RealignerTargetCreator)
            2. *
            3. Running the realigner over those intervals (see the IndelRealigner tool)
            4. *
            *

            - * Important note 1: the input BAM(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. *

            - * Important note 2: when multiple potential indels are found by the tool in the same general region, the tool will choose the most likely - * one for realignment to the exclusion of the others. This is a known limitation of the tool. - *

            - * Important note 3: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them - * (or with reads from similar technologies). This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string. + * For more details, see the indel realignment method documentation. + *

            * - *

            Input

            + *

            Inputs

            *

            - * One or more aligned BAM files and optionally one or more lists of known indels. + * One or more aligned BAM files and optionally, one or more lists of known indels. *

            * *

            Output

            *

            - * A list of target intervals to pass to the Indel Realigner. + * A list of target intervals to pass to the IndelRealigner. *

            * - *

            Examples

            + *

            Usage example

            *
            - * java -Xmx2g -jar GenomeAnalysisTK.jar \
            + * java -jar GenomeAnalysisTK.jar \
              *   -T RealignerTargetCreator \
            - *   -R ref.fasta \
            + *   -R reference.fasta \
              *   -I input.bam \
            - *   -o forIndelRealigner.intervals \
            - *   [--known /path/to/indels.vcf]
            + *   --known indels.vcf \
            + *   -o forIndelRealigner.intervals
              * 
            * - * @author ebanks + *

            Notes

            + *
              + *
            • The input BAM(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step.
            • + *
            • When multiple potential indels are found by the tool in the same general region, the tool will choose the most likely + * one for realignment to the exclusion of the others. This is a known limitation of the tool.
            • + *
            • Because reads produced from the 454 technology inherently contain false indels, the realigner will not work with them + * (or with reads from similar technologies).
            • + *
            • This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string.
            • + *
            + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) -@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, BadMateFilter.class, Platform454Filter.class, BadCigarFilter.class}) +@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, BadMateFilter.class, Platform454Filter.class}) @Reference(window=@Window(start=-1,stop=50)) @Allows(value={DataSource.READS, DataSource.REFERENCE}) @By(DataSource.REFERENCE) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java index f43d8377c..709711064 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java @@ -56,15 +56,17 @@ import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.samples.Sample; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; @@ -76,17 +78,25 @@ import java.io.PrintStream; import java.util.*; /** - * Computes the most likely genotype combination and phases trios and parent/child pairs + * Compute the most likely genotype combination and phasing for trios and parent/child pairs * *

            - * PhaseByTransmission is a GATK tool that 1) computes the most likely genotype combination and phases trios and parent/child pairs given their genotype likelihoods and a mutation prior and 2) phases - * all sites were parent/child transmission can be inferred unambiguously. It reports the genotype combination (and hence phasing) probability. - * Ambiguous sites are: + * This tool performs two functions: + *

            + *
              + *
            1. Compute the most likely genotype combination of trios and parent/child pairs given their genotype likelihoods and a mutation prior;
            2. + *
            3. Phase all sites were parent/child transmission can be inferred unambiguously.
            4. + *
            + * + *

            The tool ultimately reports the genotype combination (and hence phasing) probability.

            + * + *

            Ambiguous sites are:

            *
              *
            • Sites where all individuals are heterozygous
            • *
            • Sites where there is a Mendelian violation
            • *
            - * Missing genotypes are handled as follows: + * + *

            Missing genotypes are handled as follows:

            *
              *
            • In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.
            • *
            • In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.
            • @@ -102,26 +112,26 @@ import java.util.*; *
            *

            * - *

            Options

            - *

            + *

            Important options

            *
              - *
            • MendelianViolationsFile: An optional argument for reporting. If a file is specified, all sites that remain in mendelian violation after being assigned the most likely genotype - * combination will be reported there. Information reported: chromosome, position, filter, allele count in VCF, family, transmission probability, - * and each individual genotype, depth, allelic depth and likelihoods.
            • + *
            • MendelianViolationsFile: An optional argument for reporting. If a file is specified, all sites that + * remain in mendelian violation after being assigned the most likely genotype combination will be reported + * there. Information reported: chromosome, position, filter, allele count in VCF, family, transmission + * probability, and each individual genotype, depth, allelic depth and likelihoods.
            • *
            • DeNovoPrior: Mutation prio; default is 1e-8
            • *
            - *

            * *

            Output

            *

            - * An VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent where non ambiguous.. + * An VCF with genotypes recalibrated as most likely under the familial constraint and phased by descent (where non + * ambiguous). *

            * - *

            Examples

            + *

            Usage example

            *
            - * java -Xmx2g -jar GenomeAnalysisTK.jar \
            - *   -R ref.fasta \
            + * java -jar GenomeAnalysisTK.jar \
              *   -T PhaseByTransmission \
            + *   -R reference.fasta \
              *   -V input.vcf \
              *   -ped input.ped \
              *   -o output.vcf
            @@ -146,7 +156,6 @@ public class PhaseByTransmission extends RodWalker, HashMa
                 @Output
                 protected VariantContextWriter vcfWriter = null;
             
            -    private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP";
                 private final String SOURCE_NAME = "PhaseByTransmission";
             
                 public final double NO_TRANSMISSION_PROB = -1.0;
            @@ -414,7 +423,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
                        Map genotypeAttributes = new HashMap();
                        genotypeAttributes.putAll(genotype.getExtendedAttributes());
                        if(transmissionProb>NO_TRANSMISSION_PROB)
            -                genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission);
            +                genotypeAttributes.put(GATKVCFConstants.TRANSMISSION_PROBABILITY_KEY, phredScoreTransmission);
             
                         ArrayList phasedAlleles = new ArrayList(2);
                         for(Allele allele : phasedGenotype.getAlleles()){
            @@ -461,7 +470,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
             
                     Set headerLines = new HashSet();
                     headerLines.addAll(GATKVCFUtils.getHeaderFields(this.getToolkit()));
            -        headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred score of the genotype combination and phase given that the genotypes are correct"));
            +        headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.TRANSMISSION_PROBABILITY_KEY));
                     headerLines.add(new VCFHeaderLine("source", SOURCE_NAME));
                     vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples));
             
            @@ -879,7 +888,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
                                 updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters);
                                 mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s",
                                         vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(),
            -                            phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),printAD(phasedMother.getAD()),
            +                            phasedMother.getExtendedAttribute(GATKVCFConstants.TRANSMISSION_PROBABILITY_KEY),phasedMother.getGenotypeString(),phasedMother.getDP(),printAD(phasedMother.getAD()),
                                         phasedMother.getLikelihoodsString(), phasedFather.getGenotypeString(),phasedFather.getDP(),printAD(phasedFather.getAD()),phasedFather.getLikelihoodsString(),
                                         phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString());
                                 if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
            @@ -891,7 +900,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
                                     metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
                                 mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s:%s:%s:%s\t.\t.\t.\t.\t%s\t%s\t%s\t%s",
                                         vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(),
            -                            phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),printAD(phasedMother.getAD()),phasedMother.getLikelihoodsString(),
            +                            phasedMother.getExtendedAttribute(GATKVCFConstants.TRANSMISSION_PROBABILITY_KEY),phasedMother.getGenotypeString(),phasedMother.getDP(),printAD(phasedMother.getAD()),phasedMother.getLikelihoodsString(),
                                         phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString());
                             }
                         }
            @@ -902,7 +911,7 @@ public class PhaseByTransmission extends RodWalker, HashMa
                                 metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
                             mvfLine =   String.format("%s\t%d\t%s\t%s\t%s\t.\t.\t.\t.\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s",
                                     vc.getChr(),vc.getStart(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.getFamilyID(),
            -                        phasedFather.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getDP(),printAD(phasedFather.getAD()),phasedFather.getLikelihoodsString(),
            +                        phasedFather.getExtendedAttribute(GATKVCFConstants.TRANSMISSION_PROBABILITY_KEY),phasedFather.getGenotypeString(),phasedFather.getDP(),printAD(phasedFather.getAD()),phasedFather.getLikelihoodsString(),
                                     phasedChild.getGenotypeString(),phasedChild.getDP(),printAD(phasedChild.getAD()),phasedChild.getLikelihoodsString());
                         }
             
            diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java
            index a7547ff9c..686cd6d87 100644
            --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java
            +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtils.java
            @@ -56,46 +56,36 @@ import htsjdk.samtools.util.StringUtil;
             import org.broadinstitute.gatk.utils.GenomeLoc;
             import org.broadinstitute.gatk.utils.GenomeLocParser;
             import org.broadinstitute.gatk.utils.Utils;
            +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
             import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
             import htsjdk.variant.vcf.VCFConstants;
            -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
             import htsjdk.variant.variantcontext.*;
             
             import java.util.*;
             
             /**
            - * [Short one sentence description of this walker]
            - * 

            - *

            - * [Functionality of this walker] - *

            - *

            - *

            Input

            - *

            - * [Input description] - *

            - *

            - *

            Output

            - *

            - * [Output description] - *

            - *

            - *

            Examples

            - *
            - *    java
            - *      -jar GenomeAnalysisTK.jar
            - *      -T $WalkerName
            - *  
            - * - * @author Your Name - * @since Date created + * Utility class for phasing analysis */ class PhasingUtils { + + /** + * Merge variants into a multi-nucleotide polymorphism (MNP) + * + * @param genomeLocParser parse the genome locations + * @param vc1 variant context 1 + * @param vc2 variant context 2 + * @param referenceFile sequence file containing the reference genome + * @param alleleMergeRule rule for merging variants + * @return merged variant or null if the variants are NOT an SNP or MNP, on the same contig, variant location 1 is the same or after the variant location 2, + * their genotypes do NOT have the same number of chromosomes, haplotype, number of attributes as chromosomes, are both hetrozygous or do not abide by the merge rule + */ static VariantContext mergeIntoMNP(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile, AlleleMergeRule alleleMergeRule) { + + // Check if variants are an SNP or MNP, on the same contig, variant location 1 is not before variant location 2 if (!mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2)) return null; - // Check that it's logically possible to merge the VCs: + // Check if variant genotypes have the same number of chromosomes, haplotype, number of attributes as chromosomes, and either genotype is homozygous if (!allSamplesAreMergeable(vc1, vc2)) return null; @@ -106,63 +96,95 @@ class PhasingUtils { return reallyMergeIntoMNP(vc1, vc2, referenceFile); } - // Assumes: alleleSegregationIsKnown(gt1, gt2) + /** + * Find the alleles with the same haplotype + * assumes alleleSegregationIsKnown + * TODO - should alleleSegregationIsKnown be called within this method? + * + * @param gt1 genotype 1 + * @param gt2 genotype 2 + * @return gt1 and gt2 alleles with the same haplotype + */ static SameHaplotypeAlleles matchHaplotypeAlleles(final Genotype gt1, final Genotype gt2) { + final SameHaplotypeAlleles hapAlleles = new SameHaplotypeAlleles(); - final int nalleles = gt1.getPloidy(); - final Allele[] site1AllelesArray = gt1.getAlleles().toArray(new Allele[nalleles]); - final Allele[] site2AllelesArray = gt2.getAlleles().toArray(new Allele[nalleles]); + // Get the alleles + final int numAlleles = gt1.getPloidy(); + final Allele[] site1AllelesArray = gt1.getAlleles().toArray(new Allele[numAlleles]); + final Allele[] site2AllelesArray = gt2.getAlleles().toArray(new Allele[numAlleles]); - final int[] site2Inds = new int[nalleles]; - if (gt1.hasAnyAttribute(ReadBackedPhasing.HP_KEY) && gt2.hasAnyAttribute(ReadBackedPhasing.HP_KEY)) { - final String[] hp1 = (String[]) gt1.getAnyAttribute(ReadBackedPhasing.HP_KEY); - final String[] hp2 = (String[]) gt2.getAnyAttribute(ReadBackedPhasing.HP_KEY); + // locations of the same HP attribute in gt2 to gt2 + final int[] site1ToSite2Inds = new int[numAlleles]; - final HashMap site1Inds = new HashMap(); + if (gt1.hasAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY) && gt2.hasAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY)) { + final String[] hp1 = (String[]) gt1.getAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY); + final String[] hp2 = (String[]) gt2.getAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY); + + // Map of HP attribute to it's array index + final HashMap hpNameToSite1Inds = new HashMap(); + + // Hp name to index for (int ind1 = 0; ind1 < hp1.length; ++ind1) { - final String h1 = hp1[ind1]; - site1Inds.put(h1, ind1); + hpNameToSite1Inds.put(hp1[ind1], ind1); } + // Find the index of the gt2 HP attribute in gt1 HP attribute array for (int ind2 = 0; ind2 < hp2.length; ++ind2) { - final String h2 = hp2[ind2]; - final int ind1 = site1Inds.get(h2); + final int ind1 = hpNameToSite1Inds.get(hp2[ind2]); + + // attributes are not in the same position in both genotypes if (ind2 != ind1) hapAlleles.requiresSwap = true; - site2Inds[ind2] = ind1; // this is OK, since allSamplesAreMergeable() + + site1ToSite2Inds[ind1] = ind2; } } else { // gt1.isHom() || gt2.isHom() ; so, we trivially merge the corresponding alleles - for (int ind = 0; ind < site2Inds.length; ++ind) - site2Inds[ind] = ind; + for (int ind = 0; ind < site1ToSite2Inds.length; ++ind) + site1ToSite2Inds[ind] = ind; } - for (int ind1 = 0; ind1 < nalleles; ++ind1) { + // Get the alleles for gt1 and gt2 with the same haplotype + for (int ind1 = 0; ind1 < numAlleles; ++ind1) { final Allele all1 = site1AllelesArray[ind1]; - final int ind2 = site2Inds[ind1]; + final int ind2 = site1ToSite2Inds[ind1]; final Allele all2 = site2AllelesArray[ind2]; // this is OK, since alleleSegregationIsKnown(gt1, gt2) + // add the 2 alleles hapAlleles.hapAlleles.add(new AlleleOneAndTwo(all1, all2)); } return hapAlleles; } + /** + * Merge variants into a multi-nucleotide polymorphism (MNP) + * + * @param vc1 variant context 1 + * @param vc2 variant context 2 + * @param referenceFile sequence file containing the reference genome + * @return variant with the merged MNP + */ static VariantContext reallyMergeIntoMNP(VariantContext vc1, VariantContext vc2, ReferenceSequenceFile referenceFile) { final int startInter = vc1.getEnd() + 1; final int endInter = vc2.getStart() - 1; byte[] intermediateBases = null; + + // get bases between vc1 and vc2 in the reference sequence if (startInter <= endInter) { intermediateBases = referenceFile.getSubsequenceAt(vc1.getChr(), startInter, endInter).getBases(); StringUtil.toUpperCase(intermediateBases); } + + // merge the reference bases with vc1 and vc2 final MergedAllelesData mergeData = new MergedAllelesData(intermediateBases, vc1, vc2); // ensures that the reference allele is added final GenotypesContext mergedGenotypes = GenotypesContext.create(); for (final Genotype gt1 : vc1.getGenotypes()) { final Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); + // Alleles with the same haplotype final SameHaplotypeAlleles hapAlleles = matchHaplotypeAlleles(gt1, gt2); boolean isPhased = gt1.isPhased() && gt2.isPhased(); @@ -179,61 +201,83 @@ class PhasingUtils { final Map mergedGtAttribs = new HashMap(); + // get the min read backed phasing quality double PQ = Double.MAX_VALUE; - if (gt1.hasAnyAttribute(ReadBackedPhasing.PQ_KEY)) { - PQ = Math.min(PQ, (double) gt1.getAnyAttribute(ReadBackedPhasing.PQ_KEY)); + if (gt1.hasAnyAttribute(VCFConstants.PHASE_QUALITY_KEY)) { + PQ = Math.min(PQ, (double) gt1.getAnyAttribute(VCFConstants.PHASE_QUALITY_KEY)); } - if (gt2.hasAnyAttribute(ReadBackedPhasing.PQ_KEY)) { - PQ = Math.min(PQ, (double) gt2.getAnyAttribute(ReadBackedPhasing.PQ_KEY)); + if (gt2.hasAnyAttribute(VCFConstants.PHASE_QUALITY_KEY)) { + PQ = Math.min(PQ, (double) gt2.getAnyAttribute(VCFConstants.PHASE_QUALITY_KEY)); } if (PQ != Double.MAX_VALUE) - mergedGtAttribs.put(ReadBackedPhasing.PQ_KEY, PQ); + mergedGtAttribs.put(VCFConstants.PHASE_QUALITY_KEY, PQ); - if (gt1.hasAnyAttribute(ReadBackedPhasing.HP_KEY)) { - mergedGtAttribs.put(ReadBackedPhasing.HP_KEY, gt1.getAnyAttribute(ReadBackedPhasing.HP_KEY)); + if (gt1.hasAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY)) { + mergedGtAttribs.put(GATKVCFConstants.RBP_HAPLOTYPE_KEY, gt1.getAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY)); } - else if (gt2.hasAnyAttribute(ReadBackedPhasing.HP_KEY)) { // gt1 doesn't have, but merged (so gt1 is hom and can take gt2's haplotype names): - mergedGtAttribs.put(ReadBackedPhasing.HP_KEY, gt2.getAnyAttribute(ReadBackedPhasing.HP_KEY)); + else if (gt2.hasAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY)) { // gt1 doesn't have, but merged (so gt1 is hom and can take gt2's haplotype names): + mergedGtAttribs.put(GATKVCFConstants.RBP_HAPLOTYPE_KEY, gt2.getAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY)); } + // make the merged genotype final Genotype mergedGt = new GenotypeBuilder(gt1.getSampleName(), mergedAllelesForSample).log10PError(mergedGQ).attributes(mergedGtAttribs).phased(isPhased).make(); mergedGenotypes.add(mergedGt); } + // get the merged name final String mergedName = mergeVariantContextNames(vc1.getSource(), vc2.getSource()); final double mergedLog10PError = Math.min(vc1.getLog10PError(), vc2.getLog10PError()); final Set mergedFilters = new HashSet(); // Since vc1 and vc2 were unfiltered, the merged record remains unfiltered final Map mergedAttribs = mergeVariantContextAttributes(vc1, vc2); - // ids + // get the merged ID final List mergedIDs = new ArrayList(); if ( vc1.hasID() ) mergedIDs.add(vc1.getID()); if ( vc2.hasID() ) mergedIDs.add(vc2.getID()); final String mergedID = mergedIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(VCFConstants.ID_FIELD_SEPARATOR, mergedIDs); + // make the merged variant context final VariantContextBuilder mergedBuilder = new VariantContextBuilder(mergedName, vc1.getChr(), vc1.getStart(), vc2.getEnd(), mergeData.getAllMergedAlleles()).id(mergedID).genotypes(mergedGenotypes).log10PError(mergedLog10PError).filters(mergedFilters).attributes(mergedAttribs); VariantContextUtils.calculateChromosomeCounts(mergedBuilder, true); return mergedBuilder.make(); } + /** + * Merge variant context names + * + * @param name1 variant context 1 name + * @param name2 variant context 2 name + * @return merged variant names (name1_name2) + */ static String mergeVariantContextNames(String name1, String name2) { return name1 + "_" + name2; } + /** + * Get preset attributes and that are in vc1 or vc2 + * TODO: Will always return an empty map because MERGE_OR_ATTRIBS is empty + * + * @param vc1 variant context 1 + * @param vc2 variant context 2 + * @return merged attributes in vc1 or vc2 + */ static Map mergeVariantContextAttributes(VariantContext vc1, VariantContext vc2) { + // Map of attribute name to value Map mergedAttribs = new HashMap(); final List vcList = new LinkedList(); vcList.add(vc1); vcList.add(vc2); + // Attribute of interest //String[] MERGE_OR_ATTRIBS = {VCFConstants.DBSNP_KEY}; final String[] MERGE_OR_ATTRIBS = {}; for (String orAttrib : MERGE_OR_ATTRIBS) { boolean attribVal = false; for (VariantContext vc : vcList) { + // Does the variant have the attribute? attribVal = vc.getAttributeAsBoolean(orAttrib, false); - if (attribVal) // already true, so no reason to continue: + if ( attribVal ) // already true, so no reason to continue: break; } mergedAttribs.put(orAttrib, attribVal); @@ -242,25 +286,42 @@ class PhasingUtils { return mergedAttribs; } + /** + * Check if variants can be merged into the multi-nucleotide polymorphism (MNP) + * + * @param genomeLocParser parse the genome locations + * @param vc1 variant context 1 + * @param vc2 variant context 2 + * @return true if variants are an SNP or MNP, on the same contig, variant location 1 is not before variant location 2, unfiltered, from the same sample set and are called, + * false otherwise + */ static boolean mergeIntoMNPvalidationCheck(GenomeLocParser genomeLocParser, VariantContext vc1, VariantContext vc2) { - if (!vc1.isSNP() || !vc2.isSNP()) + // Can only merge "simple" base strings (i.e., SNPs or MNPs, but not indels): + final boolean vc1CanBeMerged = vc1.isSNP() || vc1.isMNP(); + final boolean vc2CanBeMerged = vc2.isSNP() || vc2.isMNP(); + if (!vc1CanBeMerged || !vc2CanBeMerged) return false; final GenomeLoc loc1 = GATKVariantContextUtils.getLocation(genomeLocParser, vc1); final GenomeLoc loc2 = GATKVariantContextUtils.getLocation(genomeLocParser, vc2); + // Must be on same contig if (!loc1.onSameContig(loc2)) return false; + // Variant 1 location must not be before variant context 2 if (!loc1.isBefore(loc2)) return false; + // Variants can not be filtered if (vc1.isFiltered() || vc2.isFiltered()) return false; - if (!vc1.getSampleNames().equals(vc2.getSampleNames())) // vc1, vc2 refer to different sample sets + // Variants must come from the same sample set + if (!vc1.getSampleNames().equals(vc2.getSampleNames())) return false; + // All of the variant genotypes must be unfiltered and called if (!allGenotypesAreUnfilteredAndCalled(vc1) || !allGenotypesAreUnfilteredAndCalled(vc2)) return false; @@ -276,10 +337,19 @@ class PhasingUtils { return true; } + /** + * Check if can merge genotypes from the same sample + * + * @param vc1 variant context 1 + * @param vc2 variant context 2 + * @return true if variants are phased or either is a homozygous, false otherwise + */ static boolean allSamplesAreMergeable(VariantContext vc1, VariantContext vc2) { - // Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1: + // Check that each sample's genotype in vc2 is uniquely appendable onto its genotype in vc1 for (final Genotype gt1 : vc1.getGenotypes()) { final Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); + if ( gt2 == null ) // gt2 does not have sample name + return false; if (!alleleSegregationIsKnown(gt1, gt2)) // can merge if: phased, or if either is a hom return false; @@ -288,44 +358,76 @@ class PhasingUtils { return true; } + /** + * Check if the allele segregation is known + * + * @param gt1 genotype 1 + * @param gt2 genotype 2 + * @return true if genotypes have the same number of chromosomes, haplotype, number of attributes + * as chromosomes, and either genotype is homozygous, false otherwise + */ static boolean alleleSegregationIsKnown(Genotype gt1, Genotype gt2) { + // If gt1 or gt2 do not have the same number of chromosomes, then can not be merged. if (gt1.getPloidy() != gt2.getPloidy()) return false; - // If gt1 or gt2 are hom, then could be MERGED: + // If gt1 or gt2 are homozygous, then could be merged. if (gt1.isHom() || gt2.isHom()) return true; - // Otherwise, need to check that alleles from gt1 can be matched up with alleles from gt2: - if (!gt1.hasAnyAttribute(ReadBackedPhasing.HP_KEY) || !gt2.hasAnyAttribute(ReadBackedPhasing.HP_KEY)) + // If gt1 or gt2 do not have a read backed phasing haplotype, then can not be merged + if (!gt1.hasAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY) || !gt2.hasAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY)) return false; - final String[] hp1 = (String[]) gt1.getAnyAttribute(ReadBackedPhasing.HP_KEY); - final String[] hp2 = (String[]) gt2.getAnyAttribute(ReadBackedPhasing.HP_KEY); + // If gt1 or gt2 do not same number of HP attributes as chromosomes, then can not be merged. + final String[] hp1 = (String[]) gt1.getAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY); + final String[] hp2 = (String[]) gt2.getAnyAttribute(GATKVCFConstants.RBP_HAPLOTYPE_KEY); if (hp1.length != gt1.getPloidy() || hp2.length != gt2.getPloidy()) return false; - Arrays.sort(hp1); - Arrays.sort(hp2); - return (Arrays.equals(hp1, hp2)); // The haplotype names match (though possibly in a different order) + // gt1 and gt2 must have the same read backed phasing haplotype identifier attributes to be merged + final String[] hp1Copy = Arrays.copyOf(hp1, hp1.length); + final String[] hp2Copy = Arrays.copyOf(hp2, hp2.length); + Arrays.sort(hp1Copy); + Arrays.sort(hp2Copy); + return (Arrays.equals(hp1Copy, hp2Copy)); // The haplotype names match (though possibly in a different order) } + /** + * Check if some samples have double alternate alleles + * + * @param vc1 variant context 1 + * @param vc2 variant context 2 + * @return true if there is a sample with double alternate alleles, false otherwise + */ static boolean someSampleHasDoubleNonReferenceAllele(VariantContext vc1, VariantContext vc2) { for (final Genotype gt1 : vc1.getGenotypes()) { + // gt2 from the same sample as gt1 final Genotype gt2 = vc2.getGenotype(gt1.getSampleName()); - final SameHaplotypeAlleles hapAlleles = matchHaplotypeAlleles(gt1, gt2); - for (AlleleOneAndTwo all1all2 : hapAlleles.hapAlleles) { - if (all1all2.all1.isNonReference() && all1all2.all2.isNonReference()) // corresponding alleles are alternate - return true; + if ( gt2 != null ) { + // Find the alleles with the same haplotype + final SameHaplotypeAlleles hapAlleles = matchHaplotypeAlleles(gt1, gt2); + + // Find corresponding alternate alleles + for (AlleleOneAndTwo all1all2 : hapAlleles.hapAlleles) { + if (all1all2.all1.isNonReference() && all1all2.all2.isNonReference()) + return true; + } } } return false; } + /** + * Check that alleles at vc1 and at vc2 always segregate together in all samples (including reference) + * + * @param vc1 variant context 1 + * @param vc2 variant context 2 + * @return true if alleles segregate together, false otherwise + */ static boolean doubleAllelesSegregatePerfectlyAmongSamples(VariantContext vc1, VariantContext vc2) { - // Check that Alleles at vc1 and at vc2 always segregate together in all samples (including reference): final Map allele1ToAllele2 = new HashMap(); final Map allele2ToAllele1 = new HashMap(); @@ -359,6 +461,9 @@ class PhasingUtils { return true; } + /** + * Class for variants merging rules + */ abstract static class AlleleMergeRule { // vc1, vc2 are ONLY passed to allelesShouldBeMerged() if mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2) AND allSamplesAreMergeable(vc1, vc2): abstract public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2); @@ -368,8 +473,15 @@ class PhasingUtils { } } + /** + * Class for storing the alleles with the same haplotype + */ static class SameHaplotypeAlleles { + + /// Alleles are not in the same order public boolean requiresSwap; + + /// Lisgt of gthe 2 alleles with the same haplotype public List hapAlleles; public SameHaplotypeAlleles() { @@ -378,19 +490,41 @@ class PhasingUtils { } } + /** + * Class for holding 2 alleles + */ static class AlleleOneAndTwo { + /// allele 1 private Allele all1; + /// allele2 private Allele all2; + /** + * Constructor + * + * @param all1 allele 1 + * @param all2 allele 2 + */ public AlleleOneAndTwo(Allele all1, Allele all2) { this.all1 = all1; this.all2 = all2; } + /** + * Get the hah code for alleles 1 and 2 + * + * @return hash code for alleles 1 and 2 + */ public int hashCode() { return all1.hashCode() + all2.hashCode(); } + /** + * Check if equal to another 2 alleles + * + * @param other allele to compare to + * @return true if equal, false otherwise + */ public boolean equals(Object other) { if (!(other instanceof AlleleOneAndTwo)) return false; @@ -400,28 +534,60 @@ class PhasingUtils { } } + /** + * Class for merging alleles + */ static class MergedAllelesData { + /// merged alleles private Map mergedAlleles; + + /// bases between the alleles private byte[] intermediateBases; + + /// number of bases between the alleles private int intermediateLength; + /** + * Constructor + * + * @param intermediateBases array of bases + * @param vc1 variant context 1 + * @param vc2 variant context 2 + */ public MergedAllelesData(byte[] intermediateBases, VariantContext vc1, VariantContext vc2) { this.mergedAlleles = new HashMap(); // implemented equals() and hashCode() for AlleleOneAndTwo this.intermediateBases = intermediateBases; this.intermediateLength = this.intermediateBases != null ? this.intermediateBases.length : 0; + // merge the reference bases from vc1 before and vc2 after the reference (intermediate) bases this.ensureMergedAllele(vc1.getReference(), vc2.getReference(), true); } + /** + * Ensure that the alleles are merged. The merged allele is alternate. + * + * @param all1 allele 1 + * @param all2 allele 2 + * @return merged allele + */ public Allele ensureMergedAllele(Allele all1, Allele all2) { return ensureMergedAllele(all1, all2, false); // false <-> since even if all1+all2 = reference, it was already created in the constructor } - private Allele ensureMergedAllele(Allele all1, Allele all2, boolean creatingReferenceForFirstTime) { + /** + * Ensure that the alleles are merged. + * all1 is before all2, if there is a gap between them, join with the intermediate bases + * + * @param all1 allele 1 + * @param all2 allele 2 + * @param isRef if true, merged allele is reference, if false, merged allele is alternate + * @return merged allele + */ + private Allele ensureMergedAllele(Allele all1, Allele all2, boolean isRef) { AlleleOneAndTwo all12 = new AlleleOneAndTwo(all1, all2); Allele mergedAllele = mergedAlleles.get(all12); - if (mergedAllele == null) { + if (mergedAllele == null) { final byte[] bases1 = all1.getBases(); final byte[] bases2 = all2.getBases(); @@ -431,13 +597,18 @@ class PhasingUtils { System.arraycopy(intermediateBases, 0, mergedBases, bases1.length, intermediateLength); System.arraycopy(bases2, 0, mergedBases, bases1.length + intermediateLength, bases2.length); - mergedAllele = Allele.create(mergedBases, creatingReferenceForFirstTime); + mergedAllele = Allele.create(mergedBases, isRef); mergedAlleles.put(all12, mergedAllele); } return mergedAllele; } + /** + * Get all merged alleles + * + * @return set of merged alleles values + */ public Set getAllMergedAlleles() { return new HashSet(mergedAlleles.values()); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java index 0fe20e07d..ad46e191b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java @@ -58,17 +58,19 @@ import org.broadinstitute.gatk.utils.commandline.Hidden; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.engine.filters.MappingQualityZeroFilter; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.HasGenomeLocation; -import org.broadinstitute.gatk.utils.SampleUtils; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.exceptions.UserException; @@ -82,16 +84,20 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; import java.io.*; import java.util.*; -import static org.broadinstitute.gatk.utils.variant.GATKVCFUtils.getVCFHeadersFromRods; +import static org.broadinstitute.gatk.engine.GATKVCFUtils.getVCFHeadersFromRods; /** - * Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads). + * Annotate physical phasing information * - * The current implementation works for diploid SNPs, and will transparently (but properly) ignore other sites. + *

            This tool identifies haplotypes based on the overlap between reads and uses this information to generate physical + * phasing information for variants within these haplotypes.

            * - * The underlying algorithm is based on building up 2^n local haplotypes, - * where n is the number of heterozygous SNPs in the local region we expected to find phase-informative reads (and assumes a maximum value of maxPhaseSites, a user parameter). - * Then, these 2^n haplotypes are used to determine, with sufficient certainty (the assigned PQ score), to which haplotype the alleles of a genotype at a particular locus belong (denoted by the HP tag). + *

            It operates by walking along all variant ROD loci, caching a user-defined window of VariantContext sites, and + * then finishes phasing them when they go out of range (using upstream and downstream reads). The underlying algorithm + * is based on building up 2^n local haplotypes, where n is the number of heterozygous SNPs in the local region we + * expected to find phase-informative reads (and assumes a maximum value of maxPhaseSites, a user parameter). Then, + * these 2^n haplotypes are used to determine, with sufficient certainty (the assigned PQ score), to which haplotype + * the alleles of a genotype at a particular locus belong (denoted by the HP tag).

            * *

            * Performs physical phasing of SNP calls, based on sequencing reads. @@ -107,19 +113,21 @@ import static org.broadinstitute.gatk.utils.variant.GATKVCFUtils.getVCFHeadersFr * Phased VCF file. *

            * - *

            Examples

            + *

            Usage example

            *
            - *    java
            - *      -jar GenomeAnalysisTK.jar
            - *      -T ReadBackedPhasing
            - *      -R reference.fasta
            - *      -I reads.bam
            - *      --variant SNPs.vcf
            - *      -L SNPs.vcf
            - *      -o phased_SNPs.vcf
            + *    java -jar GenomeAnalysisTK.jar \
            + *      -T ReadBackedPhasing \
            + *      -R reference.fasta \
            + *      -I reads.bam \
            + *      --variant SNPs.vcf \
            + *      -L SNPs.vcf \
            + *      -o phased_SNPs.vcf \
              *      --phaseQualityThresh 20.0
              * 
            * + *

            Caveat

            + *

            The current implementation works for diploid SNPs, and will transparently (but properly) ignore other sites.

            + * * @author Menachem Fromer * @since July 2010 */ @@ -179,15 +187,10 @@ public class ReadBackedPhasing extends RodWalker vcfSamples = new TreeSet(samplesToPhase == null ? rodNameToHeader.get(trackName).getGenotypeSamples() : samplesToPhase); writer.writeHeader(new VCFHeader(hInfo, vcfSamples)); - Set readSamples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + Set readSamples = ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); readSamples.retainAll(vcfSamples); if (readSamples.isEmpty()) { String noPhaseString = "No common samples in VCF and BAM headers" + (samplesToPhase == null ? "" : " (limited to sampleToPhase parameters)") + ", so nothing could possibly be phased!"; @@ -320,7 +323,7 @@ public class ReadBackedPhasing extends RodWalker KEYS_TO_KEEP_IN_REDUCED_VCF = new HashSet(Arrays.asList(PQ_KEY)); + private static final Set KEYS_TO_KEEP_IN_REDUCED_VCF = new HashSet<>(Arrays.asList(VCFConstants.PHASE_QUALITY_KEY)); private VariantContext reduceVCToSamples(VariantContext vc, Set samplesToPhase) { // for ( String sample : samplesToPhase ) @@ -445,15 +448,15 @@ public class ReadBackedPhasing extends RodWalkerThis tool identifies all N cigar elements in sequence reads, and creates k+1 new reads + * (where k is the number of N cigar elements) that correspond to the segments of the original read beside/between + * the splicing events represented by the Ns in the original CIGAR. The first read includes the bases that are to the + * left of the first N element, while the part of the read that is to the right of the N (including the Ns) is hard + * clipped, and so on for the rest of the new reads.

            * + *

            Input

            + *

            + * One or more bam files. + *

            + * + *

            Output

            + *

            + * A single processed bam file. + *

            + * + *

            Usage example

            + *
            + * java -jar GenomeAnalysisTK.jar \
            + *   -T SplitNCigarReads \
            + *   -R reference.fasta \
            + *   -I input.bam \
            + *   -o output.bam \
            + *   -U ALLOW_N_CIGARS
            + *
            + * 

            Note

            + *

            When this tool is used as part of the RNAseq best practices, the command should include mapping quality + * reassignment. See the Best Practices documentation for details.

            * - * User: ami - * Date: 11/14/13 - * Time: 11:52 AM */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) @@ -141,8 +163,7 @@ public class SplitNCigarReads extends ReadWalker rnaReadTransformers = Collections.emptyList(); - + private List rnaReadTransformers = new ArrayList<>(); @Override @@ -156,7 +177,7 @@ public class SplitNCigarReads extends ReadWalkerGiven a set of variants, this tool will generate simulated reads that support the input variants.

            * - *

            Caveats

            - *

            For practical reasons, only bi-allelic variants that are not too close to the ends of contigs (< 1/2 read length) are supported; all others will simply be ignored.

            + *

            Caveat

            + *

            For practical reasons, only bi-allelic variants that are not too close to the ends of contigs + * (< 1/2 read length) are supported; all others will simply be ignored.

            * *

            Input

            *

            A VCF file containing variants.

            * *

            Output

            - *

            A BAM file containing simulated sequence reads that support the input variants, with the requested error rate and coverage depth.

            + *

            A BAM file containing simulated sequence reads that support the input variants, with the requested error rate + * and coverage depth.

            * - *

            Example

            + *

            Usage example

            *
            - * java -Xmx2g -jar GenomeAnalysisTK.jar \
            + * java -jar GenomeAnalysisTK.jar \
              *   -T SimulateReadsForVariants \
              *   -R reference.fasta \
              *   -V input_variants.vcf \
            @@ -106,7 +107,6 @@ import java.util.*;
              *
              */
             @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class}, gotoDev = HelpConstants.EB)
            -
             @Reference(window=@Window(start=-200,stop=200))
             public class SimulateReadsForVariants extends RodWalker {
                 private static Logger logger = Logger.getLogger(SimulateReadsForVariants.class);
            @@ -175,7 +175,7 @@ public class SimulateReadsForVariants extends RodWalker {
             
                 // randomness related variables
                 private static final long RANDOM_SEED = 1252863495;
            -    private static final Random ran = GenomeAnalysisEngine.getRandomGenerator();
            +    private static final Random ran = Utils.getRandomGenerator();
                 private Poisson poissonRandom = null;
             
                 // samples and read groups
            @@ -222,8 +222,7 @@ public class SimulateReadsForVariants extends RodWalker {
             
                     final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME);
                     if ( !NO_PG_TAG ) {
            -            final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("GATKText");
            -            programRecord.setProgramVersion(headerInfo.getString("org.broadinstitute.gatk.tools.version"));
            +            programRecord.setProgramVersion(CommandLineProgram.getVersionNumber());
                         programRecord.setCommandLine(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this));
                     }
                     header.setProgramRecords(Arrays.asList(programRecord));
            diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java
            index ac3dfcdd2..09346919a 100644
            --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java
            +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java
            @@ -57,13 +57,15 @@ import org.broadinstitute.gatk.engine.walkers.*;
             import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.FixedAFCalculatorProvider;
             import org.broadinstitute.gatk.utils.commandline.*;
             import org.broadinstitute.gatk.engine.CommandLineGATK;
            -import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
            -import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
            -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
            +import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
            +import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
            +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
             import org.broadinstitute.gatk.tools.walkers.genotyper.*;
            -import org.broadinstitute.gatk.utils.SampleUtils;
            +import org.broadinstitute.gatk.engine.SampleUtils;
             import org.broadinstitute.gatk.utils.help.HelpConstants;
            -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils;
            +import org.broadinstitute.gatk.engine.GATKVCFUtils;
            +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
            +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
             import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
             import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
             import htsjdk.variant.variantcontext.VariantContext;
            @@ -76,7 +78,7 @@ import java.util.Set;
             import static org.broadinstitute.gatk.utils.IndelUtils.isInsideExtendedIndel;
             
             /**
            - * Genotypes a dataset and validates the calls of another dataset using the Unified Genotyper.
            + * Genotype and validate a dataset and the calls of another dataset using the Unified Genotyper
              *
              *  

            Note that this is an old tool that makes use of the UnifiedGenotyper, which has since been * deprecated in favor of the HaplotypeCaller.

            @@ -180,36 +182,30 @@ import static org.broadinstitute.gatk.utils.IndelUtils.isInsideExtendedIndel; * *
      * - *

      Examples

      - *
        - *
      1. - * Genotypes BAM file from new technology using the VCF as a truth dataset: - *
      2. - * + *

        Usage examples

        + *

        Genotypes BAM file from new technology using the VCF as a truth dataset

        *
          *  java
        - *      -jar /GenomeAnalysisTK.jar
        - *      -T  GenotypeAndValidate
        - *      -R human_g1k_v37.fasta
        - *      -I myNewTechReads.bam
        - *      -alleles handAnnotatedVCF.vcf
        - *      -L handAnnotatedVCF.vcf
        + *      -jar GenomeAnalysisTK.jar \
        + *      -T  GenotypeAndValidate \
        + *      -R reference.fasta \
        + *      -I myNewTechReads.bam \
        + *      -alleles handAnnotatedVCF.vcf \
        + *      -L handAnnotatedVCF.vcf \
        + *      -o output.vcf
          * 
        * - *
      3. - * Using a BAM file as the truth dataset: - *
      4. - * + *

        Genotypes BAM file from new technology a BAM file as the truth dataset

        *
          *  java
        - *      -jar /GenomeAnalysisTK.jar
        - *      -T  GenotypeAndValidate
        - *      -R human_g1k_v37.fasta
        - *      -I myTruthDataset.bam
        - *      -alleles callsToValidate.vcf
        - *      -L callsToValidate.vcf
        - *      -bt
        - *      -o gav.vcf
        + *      -jar GenomeAnalysisTK.jar \
        + *      -T  GenotypeAndValidate \
        + *      -R reference.fasta \
        + *      -I myTruthDataset.bam \
        + *      -alleles callsToValidate.vcf \
        + *      -L callsToValidate.vcf \
        + *      -bt \
        + *      -o output.vcf
          * 
        * */ @@ -333,7 +329,7 @@ public class GenotypeAndValidate extends RodWalker headerLines = VCFUtils.smartMergeHeaders(header.values(), true); headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate")); - headerLines.add(new VCFInfoHeaderLine("callStatus", 1, VCFHeaderLineType.String, "Value from the validation VCF")); + headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.GENOTYPE_AND_VALIDATE_STATUS_KEY)); vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); } @@ -496,8 +492,8 @@ public class GenotypeAndValidate extends RodWalker numValidationSites) { // take off one from randomly selected bin - int k= GenomeAnalysisEngine.getRandomGenerator().nextInt(NUM_BINS); + int k= Utils.getRandomGenerator().nextInt(NUM_BINS); sitesToChoosePerBin[k]--; totalSites--; } while (totalSites < numValidationSites) { // take off one from randomly selected bin - int k= GenomeAnalysisEngine.getRandomGenerator().nextInt( NUM_BINS); + int k= Utils.getRandomGenerator().nextInt( NUM_BINS); sitesToChoosePerBin[k]++; totalSites++; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/validationsiteselector/ValidationSiteSelector.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/validationsiteselector/ValidationSiteSelector.java index 744dd0623..01572780d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/validationsiteselector/ValidationSiteSelector.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/validationsiteselector/ValidationSiteSelector.java @@ -53,17 +53,17 @@ package org.broadinstitute.gatk.tools.walkers.validation.validationsiteselector; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.writer.VariantContextWriter; @@ -73,22 +73,28 @@ import java.util.*; /** - * Randomly selects VCF records according to specified options. + * Randomly select variant records according to specified options * *

        - * ValidationSiteSelectorWalker is intended for use in experiments where we sample data randomly from a set of variants, for example - * in order to choose sites for a follow-up validation study. + * This tool is intended for use in experiments where we sample data randomly from a set of variants, for example + * in order to choose sites for a follow-up validation study.

        * - * Sites are selected randomly but within certain restrictions. There are two main sources of restrictions - * a) Sample restrictions. A user can specify a set of samples, and we will only consider sites which are polymorphic within such given sample subset. - * These sample restrictions can be given as a set of individual samples, a text file (each line containing a sample name), or a regular expression. - * A user can additionally specify whether samples will be considered based on their genotypes (a non-reference genotype means that such sample is polymorphic in that variant, - * and hence that variant will be considered for inclusion in set), or based on their PLs. - * b) A user can additionally specify a sampling method based on allele frequency. Two sampling methods are currently supported. - * 1. Uniform sampling will just sample uniformly from variants polymorphic in selected samples. - * 2. Sampling based on Allele Frequency spectrum will ensure that output sites have the same AF distribution as the input set. - * - * User can additionally restrict output to a particular type of variant (SNP, Indel, etc.) + *

        Sites are selected randomly but within certain restrictions. There are two main sources of restrictions:

        + *
          + *
        • Sample restrictions: A user can specify a set of samples, and we will only consider sites which are + * polymorphic within the given sample subset. These sample restrictions can be given as a set of individual + * samples, a text file (each line containing a sample name), or a regular expression. A user can additionally + * specify whether samples will be considered based on their genotypes (a non-reference genotype means that the + * sample is polymorphic in that variant, and hence that variant will be considered for inclusion in set), or + * based on their PLs.
        • + *
        • Sampling methods: + *
            + *
          1. Uniform sampling will just sample uniformly from variants that are polymorphic in selected samples
          2. + *
          3. Sampling based on Allele Frequency spectrum will ensure that output sites have the same AF distribution as the input set
          4. + *
          + *
        • + *
        • Variant type (SNP, Indel, etc.)
        • + *
        * *

        Input

        *

        @@ -100,29 +106,30 @@ import java.util.*; * A sites-only VCF with the desired number of randomly selected sites. *

        * - *

        Examples

        + *

        Usage examples

        *
        - * java -Xmx2g -jar GenomeAnalysisTK.jar \
        - *   -R ref.fasta \
        + * java -jar GenomeAnalysisTK.jar \
          *   -T ValidationSiteSelectorWalker \
        - *   --variant input1.vcf \
        - *   --variant input2.vcf \
        + *   -R reference.fasta \
        + *   -V input1.vcf \
        + *   -V input2.vcf \
          *   -sn NA12878 \
          *   -o output.vcf \
          *   --numValidationSites 200   \
        - *   -sampleMode  POLY_BASED_ON_GT \
        + *   -sampleMode POLY_BASED_ON_GT \
          *   -freqMode KEEP_AF_SPECTRUM
        - *
        - * java -Xmx2g -jar GenomeAnalysisTK.jar \
        - *   -R ref.fasta \
        + * 
        + *
        + * java -jar GenomeAnalysisTK.jar \
          *   -T ValidationSiteSelectorWalker \
        - *   --variant:foo input1.vcf \
        - *   --variant:bar input2.vcf \
        + *   -R reference.fasta \
        + *   -V:foo input1.vcf \
        + *   -V:bar input2.vcf \
          *   --numValidationSites 200 \
          *   -sf samples.txt \
          *   -o output.vcf \
          *   -sampleMode  POLY_BASED_ON_GT \
        -  *   -freqMode UNIFORM
        +  *   -freqMode UNIFORM \
          *   -selectType INDEL
          * 
        * diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java index 2c037bee1..2361ca64e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java @@ -53,53 +53,71 @@ package org.broadinstitute.gatk.tools.walkers.variantrecalibration; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.PartitionBy; import org.broadinstitute.gatk.engine.walkers.PartitionType; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.TreeReducible; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.io.File; import java.util.*; /** - * Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel truth sensitivity levels which were specified during VariantRecalibration + * Apply a score cutoff to filter variants based on a recalibration table * *

        - * Using the tranche file generated by the previous step the ApplyRecalibration walker looks at each variant's VQSLOD value + * This tool performs the second pass in a two-stage process called VQSR; the first pass is performed by the + * VariantRecalibrator tool. + * In brief, the first pass consists of creating a Gaussian mixture model by looking at the distribution of annotation + * values over a high quality subset of the input call set, and then scoring all input variants according to the model. + * The second pass consists of filtering variants based on score cutoffs identified in the first pass. + *

        + * + *

        + * Using the tranche file and recalibration table generated by the previous step, the ApplyRecalibration tool looks at each variant's VQSLOD value * and decides which tranche it falls in. Variants in tranches that fall below the specified truth sensitivity filter level - * have their filter field annotated with its tranche level. This will result in a call set that simultaneously is filtered - * to the desired level but also has the information necessary to pull out more variants for a higher sensitivity but a - * slightly lower quality level. + * have their FILTER field annotated with the corresponding tranche level. This will result in a call set that is filtered + * to the desired level but retains the information necessary to increase sensitivity if needed.

        + * + *

        To be clear, please note that by "filtered", we mean that variants failing the requested tranche cutoff are marked + * as filtered in the output VCF; they are not discarded.

        + * + *

        VQSR is probably the hardest part of the Best Practices to get right, so be sure to read the + * method documentation, + * parameter recommendations and + * tutorial to really understand what these + * tools and how to use them for best results on your own data.

        * *

        Input

        - *

        - * The input raw variants to be recalibrated. - *

        - * The recalibration table file in VCF format that was generated by the VariantRecalibrator walker. - *

        - * The tranches file that was generated by the VariantRecalibrator walker. + *

          + *
        • The raw input variants to be filtered.
        • + *
        • The recalibration table file that was generated by the VariantRecalibrator tool.
        • + *
        • The tranches file that was generated by the VariantRecalibrator tool.
        • + *
        * *

        Output

        - *

        - * A recalibrated VCF file in which each variant is annotated with its VQSLOD and filtered if the score is below the desired quality level. + *

          + *
        • A recalibrated VCF file in which each variant of the requested type is annotated with its VQSLOD and marked as filtered if the score is below the desired quality level.
        • + *
        * - *

        Examples

        + *

        Usage example for filtering SNPs

        *
          * java -Xmx3g -jar GenomeAnalysisTK.jar \
          *   -T ApplyRecalibration \
        - *   -R reference/human_g1k_v37.fasta \
        + *   -R reference.fasta \
          *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.subset.b37.vcf \
          *   --ts_filter_level 99.0 \
          *   -tranchesFile path/to/output.tranches \
        @@ -108,6 +126,16 @@ import java.util.*;
          *   -o path/to/output.recalibrated.filtered.vcf
          * 
        * + *

        Caveats

        + * + *
          + *
        • The tranche values used in the example above is only a general example. You should determine the level of sensitivity + * that is appropriate for your specific project. Remember that higher sensitivity (more power to detect variants, yay!) comes + * at the cost of specificity (more false negatives, boo!). You have to choose at what point you want to set the tradeoff.
        • + *
        • In order to create the tranche reporting plots (which are only generated for SNPs, not indels!) Rscript needs to be + * in your environment PATH (this is the scripting version of R, not the interactive version). + * See http://www.r-project.org for more info on how to download and install R.
        • + *
        */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) @@ -139,7 +167,6 @@ public class ApplyRecalibration extends RodWalker implements T ///////////////////////////// // Command Line Arguments ///////////////////////////// - @Advanced @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false) protected Double TS_FILTER_LEVEL = null; @Advanced @@ -230,10 +257,10 @@ public class ApplyRecalibration extends RodWalker implements T public static void addVQSRStandardHeaderLines(final Set hInfo) { hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); - hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model")); - hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out")); - hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.POSITIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the positive training set of good variants")); - hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.NEGATIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the negative training set of bad variants")); + hInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.VQS_LOD_KEY)); + hInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.CULPRIT_KEY)); + hInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.POSITIVE_LABEL_KEY)); + hInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.NEGATIVE_LABEL_KEY)); } //--------------------------------------------------------------------------------------------------------------- @@ -260,7 +287,7 @@ public class ApplyRecalibration extends RodWalker implements T throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc ); } - final String lodString = recalDatum.getAttributeAsString(VariantRecalibrator.VQS_LOD_KEY, null); + final String lodString = recalDatum.getAttributeAsString(GATKVCFConstants.VQS_LOD_KEY, null); if( lodString == null ) { throw new UserException("Encountered a malformed record in the input recal file. There is no lod for the record at: " + vc ); } @@ -274,12 +301,12 @@ public class ApplyRecalibration extends RodWalker implements T VariantContextBuilder builder = new VariantContextBuilder(vc); // Annotate the new record with its VQSLOD and the worst performing annotation - builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod); - builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY)); - if ( recalDatum.hasAttribute(VariantRecalibrator.POSITIVE_LABEL_KEY)) - builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true); - if ( recalDatum.hasAttribute(VariantRecalibrator.NEGATIVE_LABEL_KEY)) - builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true); + builder.attribute(GATKVCFConstants.VQS_LOD_KEY, lod); + builder.attribute(GATKVCFConstants.CULPRIT_KEY, recalDatum.getAttribute(GATKVCFConstants.CULPRIT_KEY)); + if ( recalDatum.hasAttribute(GATKVCFConstants.POSITIVE_LABEL_KEY)) + builder.attribute(GATKVCFConstants.POSITIVE_LABEL_KEY, true); + if ( recalDatum.hasAttribute(GATKVCFConstants.NEGATIVE_LABEL_KEY)) + builder.attribute(GATKVCFConstants.NEGATIVE_LABEL_KEY, true); final String filterString = generateFilterString(lod); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/GaussianMixtureModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/GaussianMixtureModel.java index 08a5865d7..1eb555f2c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/GaussianMixtureModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/GaussianMixtureModel.java @@ -53,8 +53,8 @@ package org.broadinstitute.gatk.tools.walkers.variantrecalibration; import Jama.Matrix; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.Utils; import java.util.ArrayList; import java.util.Arrays; @@ -101,7 +101,7 @@ public class GaussianMixtureModel { // initialize random Gaussian means // BUGBUG: this is broken up this way to match the order of calls to rand.nextDouble() in the old code for( final MultivariateGaussian gaussian : gaussians ) { - gaussian.initializeRandomMu( GenomeAnalysisEngine.getRandomGenerator() ); + gaussian.initializeRandomMu( Utils.getRandomGenerator() ); } // initialize means using K-means algorithm @@ -112,7 +112,7 @@ public class GaussianMixtureModel { for( final MultivariateGaussian gaussian : gaussians ) { gaussian.pMixtureLog10 = Math.log10( 1.0 / ((double) gaussians.size()) ); gaussian.sumProb = 1.0 / ((double) gaussians.size()); - gaussian.initializeRandomSigma( GenomeAnalysisEngine.getRandomGenerator() ); + gaussian.initializeRandomSigma( Utils.getRandomGenerator() ); gaussian.hyperParameter_a = priorCounts; gaussian.hyperParameter_b = shrinkage; gaussian.hyperParameter_lambda = dirichletParameter; @@ -152,7 +152,7 @@ public class GaussianMixtureModel { if( numAssigned != 0 ) { gaussian.divideEqualsMu( ((double) numAssigned) ); } else { - gaussian.initializeRandomMu( GenomeAnalysisEngine.getRandomGenerator() ); + gaussian.initializeRandomMu( Utils.getRandomGenerator() ); } } } @@ -279,7 +279,7 @@ public class GaussianMixtureModel { // if it is missing marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod if( datum.isNull[iii] ) { for( int ttt = 0; ttt < numIterPerMissingAnnotation; ttt++ ) { - datum.annotations[iii] = GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); // draw a random sample from the standard normal distribution + datum.annotations[iii] = Utils.getRandomGenerator().nextGaussian(); // draw a random sample from the standard normal distribution // evaluate this random data point int gaussianIndex = 0; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/TrancheManager.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/TrancheManager.java index 52b5c923c..636dd8ece 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/TrancheManager.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/TrancheManager.java @@ -165,12 +165,12 @@ public class TrancheManager { } } - public static List findTranches( final List data, final double[] tranches, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model ) { + public static List findTranches( final List data, final List tranches, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model ) { return findTranches( data, tranches, metric, model, null ); } - public static List findTranches( final List data, final double[] trancheThresholds, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model, final File debugFile ) { - logger.info(String.format("Finding %d tranches for %d variants", trancheThresholds.length, data.size())); + public static List findTranches( final List data, final List trancheThresholds, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model, final File debugFile ) { + logger.info(String.format("Finding %d tranches for %d variants", trancheThresholds.size(), data.size())); Collections.sort( data, new VariantDatum.VariantDatumLODComparator() ); metric.calculateRunningMetric(data); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java index 023d64f7f..febef6138 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java @@ -53,8 +53,8 @@ package org.broadinstitute.gatk.tools.walkers.variantrecalibration; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.MathUtils; import htsjdk.variant.vcf.VCFConstants; @@ -66,6 +66,7 @@ import org.broadinstitute.gatk.utils.exceptions.UserException; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import java.util.*; @@ -116,7 +117,7 @@ public class VariantDataManager { varianceVector[iii] = theSTD; for( final VariantDatum datum : data ) { // Transform each data point via: (x - mean) / standard deviation - datum.annotations[iii] = ( datum.isNull[iii] ? 0.1 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD ); + datum.annotations[iii] = ( datum.isNull[iii] ? 0.1 * Utils.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD ); } } if( foundZeroVarianceAnnotation ) { @@ -251,7 +252,7 @@ public class VariantDataManager { logger.warn( "WARNING: Training with very few variant sites! Please check the model reporting PDF to ensure the quality of the model is reliable." ); } else if( trainingData.size() > VRAC.MAX_NUM_TRAINING_DATA ) { logger.warn( "WARNING: Very large training set detected. Downsampling to " + VRAC.MAX_NUM_TRAINING_DATA + " training variants." ); - Collections.shuffle(trainingData, GenomeAnalysisEngine.getRandomGenerator()); + Collections.shuffle(trainingData, Utils.getRandomGenerator()); return trainingData.subList(0, VRAC.MAX_NUM_TRAINING_DATA); } return trainingData; @@ -299,13 +300,13 @@ public class VariantDataManager { public List getRandomDataForPlotting( final int numToAdd, final List trainingData, final List antiTrainingData, final List evaluationData ) { final List returnData = new ExpandingArrayList<>(); - Collections.shuffle(trainingData, GenomeAnalysisEngine.getRandomGenerator()); - Collections.shuffle(antiTrainingData, GenomeAnalysisEngine.getRandomGenerator()); - Collections.shuffle(evaluationData, GenomeAnalysisEngine.getRandomGenerator()); + Collections.shuffle(trainingData, Utils.getRandomGenerator()); + Collections.shuffle(antiTrainingData, Utils.getRandomGenerator()); + Collections.shuffle(evaluationData, Utils.getRandomGenerator()); returnData.addAll(trainingData.subList(0, Math.min(numToAdd, trainingData.size()))); returnData.addAll(antiTrainingData.subList(0, Math.min(numToAdd, antiTrainingData.size()))); returnData.addAll(evaluationData.subList(0, Math.min(numToAdd, evaluationData.size()))); - Collections.shuffle(returnData, GenomeAnalysisEngine.getRandomGenerator()); + Collections.shuffle(returnData, Utils.getRandomGenerator()); return returnData; } @@ -349,10 +350,10 @@ public class VariantDataManager { try { value = vc.getAttributeAsDouble( annotationKey, Double.NaN ); if( Double.isInfinite(value) ) { value = Double.NaN; } - if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } - if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } - if( jitter && annotationKey.equalsIgnoreCase("InbreedingCoeff") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } - if( jitter && annotationKey.equalsIgnoreCase("SOR") && MathUtils.compareDoubles(value, LOG_OF_TWO, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } //min SOR is 2.0, then we take ln + if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } + if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } + if( jitter && annotationKey.equalsIgnoreCase("InbreedingCoeff") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } + if( jitter && annotationKey.equalsIgnoreCase("SOR") && MathUtils.compareDoubles(value, LOG_OF_TWO, 0.01) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } //min SOR is 2.0, then we take ln } catch( Exception e ) { value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model } @@ -428,11 +429,11 @@ public class VariantDataManager { for( final VariantDatum datum : data ) { VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles); builder.attribute(VCFConstants.END_KEY, datum.loc.getStop()); - builder.attribute(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod)); - builder.attribute(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")); + builder.attribute(GATKVCFConstants.VQS_LOD_KEY, String.format("%.4f", datum.lod)); + builder.attribute(GATKVCFConstants.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")); - if ( datum.atTrainingSite ) builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true); - if ( datum.atAntiTrainingSite ) builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true); + if ( datum.atTrainingSite ) builder.attribute(GATKVCFConstants.POSITIVE_LABEL_KEY, true); + if ( datum.atAntiTrainingSite ) builder.attribute(GATKVCFConstants.NEGATIVE_LABEL_KEY, true); recalWriter.add(builder.make()); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java index 1cdcb5f5f..8021db111 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java @@ -53,10 +53,9 @@ package org.broadinstitute.gatk.tools.walkers.variantrecalibration; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.PartitionBy; import org.broadinstitute.gatk.engine.walkers.PartitionType; import org.broadinstitute.gatk.engine.walkers.RodWalker; @@ -81,60 +80,74 @@ import java.io.PrintStream; import java.util.*; /** - * Create a Gaussian mixture model by looking at the annotations values over a high quality subset of the input call set and then evaluate all input variants. + * Build a recalibration model to score variant quality for filtering purposes * *

        - * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with the ApplyRecalibration walker. + * This tool performs the first pass in a two-stage process called VQSR; the second pass is performed by the + * ApplyRecalibration tool. + * In brief, the first pass consists of creating a Gaussian mixture model by looking at the distribution of annotation + * values over a high quality subset of the input call set, and then scoring all input variants according to the model. + * The second pass consists of filtering variants based on score cutoffs identified in the first pass. *

        * *

        * The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set. * You can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call. * The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship - * between SNP call annotations (QD, MQ, HaplotypeScore, and ReadPosRankSum, for example) and the probability that a SNP is a true genetic + * between SNP call annotations (such as QD, MQ, and ReadPosRankSum, for example) and the probability that a SNP is a true genetic * variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided - * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive + * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array (in humans). This adaptive * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the * probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model. *

        * + *

        VQSR is probably the hardest part of the Best Practices to get right, so be sure to read the + * method documentation, + * parameter recommendations and + * tutorial to really understand what these + * tools and how to use them for best results on your own data.

        + * *

        Inputs

        - *

        - * The input raw variants to be recalibrated. - *

        - * Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below. + *

          + *
        • The input raw variants to be recalibrated.
        • + *
        • Known, truth, and training sets to be used by the algorithm. See the method documentation for more details.
        • + *
        * *

        Output

        - *

        - * A recalibration table file in VCF format that is used by the ApplyRecalibration walker. - *

        - * A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data. + *

          + *
        • A recalibration table file that will be used by the ApplyRecalibration tool.
        • + *
        • A tranches file which shows various metrics of the recalibration callset for slices of the data.
        • + *
        * - *

        Example

        + *

        Usage example

        + *

        Recalibrating SNPs in exome data:

        *
          * java -Xmx4g -jar GenomeAnalysisTK.jar \
          *   -T VariantRecalibrator \
        - *   -R reference/human_g1k_v37.fasta \
        - *   -input NA12878.HiSeq.WGS.bwa.cleaned.raw.subset.b37.vcf \
        + *   -R reference.fasta \
        + *   -input raw_variants.vcf \
          *   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
          *   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
        + *   -resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.high_confidence.vcf
          *   -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_135.b37.vcf \
        - *   -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ -an InbreedingCoeff \
        + *   -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -an InbreedingCoeff \
          *   -mode SNP \
        - *   -recalFile path/to/output.recal \
        - *   -tranchesFile path/to/output.tranches \
        - *   -rscriptFile path/to/output.plots.R
        + *   -recalFile output.recal \
        + *   -tranchesFile output.tranches \
        + *   -rscriptFile output.plots.R
          * 
        * - *

        Caveat

        + *

        Caveats

        * *
          *
        • The values used in the example above are only meant to show how the command lines are composed. * They are not meant to be taken as specific recommendations of values to use in your own work, and they may be * different from the values cited elsewhere in our documentation. For the latest and greatest recommendations on - * how to set parameter values for you own analyses, please read the Best Practices section of the documentation.
        • - * + * how to set parameter values for you own analyses, please read the Best Practices section of the documentation, + * especially the FAQ document on VQSR parameters. + *
        • Whole genomes and exomes take slightly different parameters, so make sure you adapt your commands accordingly! See the documents linked above for details.
        • + *
        • If you work with small datasets (e.g. targeted capture experiments or small number of exomes), you will run into problems. Read the docs linked above for advice on how to deal with those issues.
        • *
        • In order to create the model reporting plots Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version). * See http://www.r-project.org for more info on how to download and install R.
        • *
        @@ -144,10 +157,6 @@ import java.util.*; @PartitionBy(PartitionType.NONE) public class VariantRecalibrator extends RodWalker, ExpandingArrayList> implements TreeReducible> { - public static final String VQS_LOD_KEY = "VQSLOD"; // Log odds ratio of being a true variant versus being false under the trained gaussian mixture model - public static final String CULPRIT_KEY = "culprit"; // The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out - public static final String NEGATIVE_LABEL_KEY = "NEGATIVE_TRAIN_SITE"; // this variant was used in the negative training set - public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE"; // this variant was used in the positive training set private static final String PLOT_TRANCHES_RSCRIPT = "plot_Tranches.R"; @ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); @@ -204,20 +213,20 @@ public class VariantRecalibrator extends RodWalker USE_ANNOTATIONS = new ArrayList(); /** * Add truth sensitivity slices through the call set at the given values. The default values are 100.0, 99.9, 99.0, and 90.0 * which will result in 4 estimated tranches in the final call set: the full set of calls (100% sensitivity at the accessible * sites in the truth set), a 99.9% truth sensitivity tranche, along with progressively smaller tranches at 99% and 90%. */ - @Argument(fullName="TStranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false) - private double[] TS_TRANCHES = new double[] {100.0, 99.9, 99.0, 90.0}; + @Argument(fullName="TStranche", shortName="tranche", doc="The levels of truth sensitivity at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false) + private List TS_TRANCHES = new ArrayList(Arrays.asList(100.0, 99.9, 99.0, 90.0)); /** * For this to work properly, the -ignoreFilter argument should also be applied to the ApplyRecalibration command. */ @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified, the variant recalibrator will also use variants marked as filtered by the specified filter name in the input VCF file", required=false) - private String[] IGNORE_INPUT_FILTERS = null; + private List IGNORE_INPUT_FILTERS = new ArrayList(); @Argument(fullName="ignore_all_filters", shortName="ignoreAllFilters", doc="If specified, the variant recalibrator will ignore all input filters. Useful to rerun the VQSR from a filtered output file.", required=false) private boolean IGNORE_ALL_FILTERS = false; @Output(fullName="rscript_file", shortName="rscriptFile", doc="The output rscript file generated by the VQSR to aid in visualization of the input data and learned model", required=false, defaultToStdout=false) @@ -251,7 +260,7 @@ public class VariantRecalibrator extends RodWalker(Arrays.asList(USE_ANNOTATIONS)), VRAC ); + dataManager = new VariantDataManager( new ArrayList<>(USE_ANNOTATIONS), VRAC ); if (RSCRIPT_FILE != null && !RScriptExecutor.RSCRIPT_EXISTS) Utils.warnUser(logger, String.format( @@ -259,7 +268,7 @@ public class VariantRecalibrator extends RodWalker * Given a VCF with genotype likelihoods from the HaplotypeCaller, UnifiedGenotyper, or another source which provides - * -unbiased- GLs, calculate the posterior genotype state and likelihood given allele frequency information from - * both the samples themselves and input VCFs describing allele frequencies in related populations. + * -unbiased- genotype likelihoods, calculate the posterior genotype state and likelihood given allele frequency + * information from both the samples themselves and input VCFs describing allele frequencies in related populations.

        * - * VCFs to use for informing the genotype likelihoods (e.g. a population-specific VCF from 1000 genomes) should have - * at least one of: - * - AC field and AN field - * - MLEAC field and AN field - * - genotypes - * - * The AF field will not be used in this calculation as it does not provide a way to estimate the confidence interval + *

        The AF field will not be used in this calculation as it does not provide a way to estimate the confidence interval * or uncertainty around the allele frequency, while AN provides this necessary information. This uncertainty is * modeled by a Dirichlet distribution: that is, the frequency is known up to a Dirichlet distribution with * parameters AC1+q,AC2+q,...,(AN-AC1-AC2-...)+q, where "q" is the global frequency prior (typically q << 1). The * genotype priors applied then follow a Dirichlet-Multinomial distribution, where 2 alleles per sample are drawn * independently. This assumption of independent draws is the assumption Hardy-Weinberg Equilibrium. Thus, HWE is - * imposed on the likelihoods as a result of CalculateGenotypePosteriors. + * imposed on the likelihoods as a result of CalculateGenotypePosteriors.

        * *

        Input

        *

        @@ -102,26 +98,28 @@ import java.util.*; *

      5. A VCF with genotype likelihoods, and optionally genotypes, AC/AN fields, or MLEAC/AN fields
      6. *
      7. (Optional) A PED pedigree file containing the description of the individuals relationships.
      8. *
    - * *

    * *

    * A collection of VCFs to use for informing allele frequency priors. Each VCF must have one of - * - AC field and AN field - * - MLEAC field and AN field - * - genotypes + *

    + *
      + *
    • AC field and AN field
    • + *
    • MLEAC field and AN field
    • + *
    • genotypes
    • + *
    *

    * *

    Output

    - *

    - * A new VCF with: - * 1) Genotype posteriors added to the genotype fields ("PP") - * 2) Genotypes and GQ assigned according to these posteriors - * 3) Per-site genotype priors added to the INFO field ("PG") - * 4) (Optional) Per-site, per-trio joint likelihoods (JL) and joint posteriors (JL) given as Phred-scaled probability + *

    A new VCF with:

    + *
      + *
    • Genotype posteriors added to the genotype fields ("PP")
    • + *
    • Genotypes and GQ assigned according to these posteriors
    • + *
    • Per-site genotype priors added to the INFO field ("PG")
    • + *
    • (Optional) Per-site, per-trio joint likelihoods (JL) and joint posteriors (JL) given as Phred-scaled probability * of all genotypes in the trio being correct based on the PLs for JL and the PPs for JP. These annotations are added to - * the genotype fields. - *

      + * the genotype fields.
    • + *
    * *

    Notes

    *

    @@ -133,51 +131,57 @@ import java.util.*; * the input callset. *

    * - *

    Examples

    + *

    Usage examples

    + *

    Inform the genotype assignment of NA12878 using the 1000G Euro panel

    *
    - * Inform the genotype assignment of NA12878 using the 1000G Euro panel
    - * java -Xmx2g -jar GenomeAnalysisTK.jar \
    - *   -R ref.fasta \
    + * java -jar GenomeAnalysisTK.jar \
      *   -T CalculateGenotypePosteriors \
    + *   -R reference.fasta \
      *   -V NA12878.wgs.HC.vcf \
      *   -supporting 1000G_EUR.genotypes.combined.vcf \
      *   -o NA12878.wgs.HC.posteriors.vcf \
      *
    - * Refine the genotypes of a large panel based on the discovered allele frequency
    - * java -Xmx2g -jar GenomeAnalysisTK.jar \
    - *   -R ref.fasta \
    + * 

    Refine the genotypes of a large panel based on the discovered allele frequency

    + *
    + * java -jar GenomeAnalysisTK.jar \
      *   -T CalculateGenotypePosteriors \
    + *   -R reference.fasta \
      *   -V input.vcf \
      *   -o output.withPosteriors.vcf
    + * 
    * - * Apply frequency and HWE-based priors to the genotypes of a family without including the family allele counts - * in the allele frequency estimates - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *

    Apply frequency and HWE-based priors to the genotypes of a family without including the family allele counts + * in the allele frequency estimates the genotypes of a large panel based on the discovered allele frequency

    + *
    + * java -jar GenomeAnalysisTK.jar \
      *   -T CalculateGenotypePosteriors \
    + *   -R reference.fasta \
      *   -V input.vcf \
      *   -o output.withPosteriors.vcf \
      *   --ignoreInputSamples
    + * 
    * - * Calculate the posterior genotypes of a callset, and impose that a variant *not seen* in the external panel - * is tantamount to being AC=0, AN=100 within that panel - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ + *

    Calculate the posterior genotypes of a callset, and impose that a variant *not seen* in the external panel + * is tantamount to being AC=0, AN=100 within that panel

    + *
    + * java -jar GenomeAnalysisTK.jar \
      *   -T CalculateGenotypePosteriors \
    + *   -R reference.fasta \
      *   -supporting external.panel.vcf \
      *   -V input.vcf \
    - *   -o output.withPosteriors.vcf
    + *   -o output.withPosteriors.vcf \
      *   --numRefSamplesIfNoCall 100
    - *   
    - * Apply only family priors to a callset
    - * java -Xmx2g -jar GenomeAnalysisTK.jar \
    - *   -R ref.fasta \
    - *   -T CalculateGenotypePosteriors \
    - *   -V input.vcf \
    - *   --skipPopulationPriors
    - *   -ped family.ped
    - *   -o output.withPosteriors.vcf 
    + * 
    * + *

    Apply only family priors to a callset

    + *
    + * java -jar GenomeAnalysisTK.jar \
    + *   -T CalculateGenotypePosteriors \
    + *   -R reference.fasta \
    + *   -V input.vcf \
    + *   --skipPopulationPriors \
    + *   -ped family.ped \
    + *   -o output.withPosteriors.vcf
      * 
    * */ @@ -195,7 +199,7 @@ public class CalculateGenotypePosteriors extends RodWalker { * be used to inform the frequency distribution underying the genotype priors. */ @Input(fullName="supporting", shortName = "supporting", doc="Other callsets to use in generating genotype posteriors", required=false) - public List> supportVariants = new ArrayList>(); + public List> supportVariants = new ArrayList<>(); /** * The global prior of a variant site -- i.e. the expected allele frequency distribution knowing only that N alleles @@ -262,10 +266,6 @@ public class CalculateGenotypePosteriors extends RodWalker { @Output(doc="File to which variants should be written") protected VariantContextWriter vcfWriter = null; - private final String JOINT_LIKELIHOOD_TAG_NAME = "JL"; - private final String JOINT_POSTERIOR_TAG_NAME = "JP"; - private final String PHRED_SCALED_POSTERIORS_KEY = "PP"; - private FamilyLikelihoodsUtils famUtils = new FamilyLikelihoodsUtils(); public void initialize() { @@ -294,8 +294,8 @@ public class CalculateGenotypePosteriors extends RodWalker { throw new UserException("VCF has no genotypes"); } - if ( header.hasInfoLine(VCFConstants.MLE_ALLELE_COUNT_KEY) ) { - final VCFInfoHeaderLine mleLine = header.getInfoHeaderLine(VCFConstants.MLE_ALLELE_COUNT_KEY); + if ( header.hasInfoLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY) ) { + final VCFInfoHeaderLine mleLine = header.getInfoHeaderLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY); if ( mleLine.getCountType() != VCFHeaderLineCount.A ) { throw new UserException("VCF does not have a properly formatted MLEAC field: the count type should be \"A\""); } @@ -307,11 +307,11 @@ public class CalculateGenotypePosteriors extends RodWalker { // Initialize VCF header final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); - headerLines.add(new VCFFormatHeaderLine(PHRED_SCALED_POSTERIORS_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Phred-scaled Posterior Genotype Probabilities")); - headerLines.add(new VCFInfoHeaderLine("PG", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Genotype Likelihood Prior")); + headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY)); + headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.GENOTYPE_PRIOR_KEY)); if (!skipFamilyPriors) { - headerLines.add(new VCFFormatHeaderLine(JOINT_LIKELIHOOD_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred-scaled joint likelihood of the genotype combination (before applying family priors)")); - headerLines.add(new VCFFormatHeaderLine(JOINT_POSTERIOR_TAG_NAME, 1, VCFHeaderLineType.Integer, "Phred-scaled joint posterior probability of the genotype combination (after applying family priors)")); + headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.JOINT_LIKELIHOOD_TAG_NAME)); + headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.JOINT_POSTERIOR_TAG_NAME)); } headerLines.add(new VCFHeaderLine("source", "CalculateGenotypePosteriors")); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java index d6b2e11e2..fffd52e4f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFs.java @@ -53,59 +53,60 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.Reference; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.Window; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.*; -import org.broadinstitute.gatk.utils.variant.ReferenceConfidenceVariantContextMerger; import java.util.*; /** - * Combines any number of gVCF files that were produced by the Haplotype Caller into a single joint gVCF file. + * Combine per-sample gVCF files produced by HaplotypeCaller into a multi-sample gVCF file * *

    * CombineGVCFs is meant to be used for hierarchical merging of gVCFs that will eventually be input into GenotypeGVCFs. * One would use this tool when needing to genotype too large a number of individual gVCFs; instead of passing them * all in to GenotypeGVCFs, one would first use CombineGVCFs on smaller batches of samples and then pass these combined - * gVCFs to GenotypeGVCFs. - * - * Note that this tool cannot work with just any gVCF files - they must have been produced with the Haplotype Caller - * as part of the "single sample discovery" pipeline using the '-ERC GVCF' mode, which uses a sophisticated reference - * model to produce accurate genotype likelihoods for every position in the target. + * gVCFs to GenotypeGVCFs.

    * *

    Input

    *

    - * One or more Haplotype Caller gVCFs to combine. + * Two or more Haplotype Caller gVCFs to combine. *

    * *

    Output

    *

    - * A combined VCF. + * A combined multisample gVCF. *

    * - *

    Examples

    + *

    Usage example

    *
    - * java -Xmx2g -jar GenomeAnalysisTK.jar \
    - *   -R ref.fasta \
    + * java -jar GenomeAnalysisTK.jar \
      *   -T CombineGVCFs \
    - *   --variant gvcf1.vcf \
    - *   --variant gvcf2.vcf \
    - *   -o mergeGvcf.vcf
    + *   -R reference.fasta \
    + *   --variant sample1.g.vcf \
    + *   --variant sample2.g.vcf \
    + *   -o cohort.g.vcf
      * 
    * + *

    Caveat

    + *

    Only gVCF files produced by HaplotypeCaller (or CombineGVCFs) can be used as input for this tool. Some other + * programs produce files that they call gVCFs but those lack some important information (accurate genotype likelihoods + * for every position) that GenotypeGVCFs requires for its operation.

    + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=0,stop=1)) @@ -113,10 +114,14 @@ public class CombineGVCFs extends RodWalker VCs; + final Set samples = new HashSet<>(); final byte[] refBases; final GenomeLoc loc; public PositionalState(final List VCs, final byte[] refBases, final GenomeLoc loc) { this.VCs = VCs; + for(final VariantContext vc : VCs){ + samples.addAll(vc.getSampleNames()); + } this.refBases = refBases; this.loc = loc; } @@ -124,6 +129,7 @@ public class CombineGVCFs extends RodWalker VCs = new LinkedList<>(); + final Set samples = new HashSet<>(); GenomeLoc prevPos = null; byte refAfterPrevPos; @@ -143,12 +149,24 @@ public class CombineGVCFs extends RodWalker vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); + headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_NAME, "Represents any possible spanning deletion allele at this location")); + headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); final VCFHeader vcfHeader = new VCFHeader(headerLines, samples); @@ -159,6 +177,10 @@ public class CombineGVCFs extends RodWalker 1 ? startingStates.refBases[1] : (byte)'N'); + if ( breakBand(startingStates.loc) || containsEndingContext(previousState.VCs, startingStates.loc.getStart()) ) { + endPreviousStates(previousState, startingStates.loc, startingStates, true); } return previousState; } + /** + * Should we break bands at the given position? + * + * @param loc the genomic location to evaluate against + * + * @return true if we should ensure that bands should be broken at the given position, false otherwise + */ + private boolean breakBand(final GenomeLoc loc) { + return USE_BP_RESOLUTION || + (loc != null && multipleAtWhichToBreakBands > 0 && (loc.getStart()+1) % multipleAtWhichToBreakBands == 0); // add +1 to the loc because we want to break BEFORE this base + } + /** * Is it okay to skip the given position? * - * @param thisPos this position - * @param lastPosRun the last position for which we created a VariantContext + * @param startingStates state information for this position + * @param previousState state information for the last position for which we created a VariantContext * @return true if it is okay to skip this position, false otherwise */ - private boolean okayToSkipThisSite(final int thisPos, final GenomeLoc lastPosRun) { - return lastPosRun != null && thisPos == lastPosRun.getStart() + 1; + private boolean okayToSkipThisSite(final PositionalState startingStates, final OverallState previousState) { + final int thisPos = startingStates.loc.getStart(); + final GenomeLoc lastPosRun = previousState.prevPos; + Set intersection = new HashSet(startingStates.samples); + intersection.retainAll(previousState.samples); + + //if there's a starting VC with a sample that's already in a current VC, don't skip this position + return lastPosRun != null && thisPos == lastPosRun.getStart() + 1 && intersection.isEmpty(); } /** @@ -235,40 +277,58 @@ public class CombineGVCFs extends RodWalker 1 ? startingStates.refBases[1] : (byte)'N' ): refBase; final List stoppedVCs = new ArrayList<>(state.VCs.size()); for ( int i = state.VCs.size() - 1; i >= 0; i-- ) { final VariantContext vc = state.VCs.get(i); - if ( vc.getStart() <= pos ) { + //the VC for the previous state will be stopped if its position is previous to the current position or it we've moved to a new contig + if ( vc.getStart() <= pos.getStart() || !vc.getChr().equals(pos.getContig())) { stoppedVCs.add(vc); // if it was ending anyways, then remove it from the future state - if ( isEndingContext(vc, pos) ) + if ( vc.getEnd() == pos.getStart()) { + state.samples.removeAll(vc.getSampleNames()); state.VCs.remove(i); + continue; //don't try to remove twice + } + + //if ending vc is the same sample as a starting VC, then remove it from the future state + if(startingStates.VCs.size() > 0 && !atCurrentPosition && startingStates.samples.containsAll(vc.getSampleNames())) { + state.samples.removeAll(vc.getSampleNames()); + state.VCs.remove(i); + } } } - if ( !stoppedVCs.isEmpty() ) { - final GenomeLoc gLoc = genomeLocParser.createGenomeLoc(stoppedVCs.get(0).getChr(), pos); + //output the stopped VCs if there is no previous output (state.prevPos == null) or our current position is past + // the last write position (state.prevPos) + //NOTE: BP resolution with have current position == state.prevPos because it gets output via a different control flow + if ( !stoppedVCs.isEmpty() && (state.prevPos == null || pos.isPast(state.prevPos) )) { + final GenomeLoc gLoc = genomeLocParser.createGenomeLoc(stoppedVCs.get(0).getChr(), pos.getStart()); // we need the specialized merge if the site contains anything other than ref blocks final VariantContext mergedVC; if ( containsTrueAltAllele(stoppedVCs) ) - mergedVC = ReferenceConfidenceVariantContextMerger.merge(stoppedVCs, gLoc, refBase, false); + mergedVC = ReferenceConfidenceVariantContextMerger.merge(stoppedVCs, gLoc, refBase, false, false); else - mergedVC = referenceBlockMerge(stoppedVCs, state, pos); + mergedVC = referenceBlockMerge(stoppedVCs, state, pos.getStart()); vcfWriter.add(mergedVC); state.prevPos = gLoc; - state.refAfterPrevPos = refBase; + state.refAfterPrevPos = refNextBase; } } @@ -308,7 +368,7 @@ public class CombineGVCFs extends RodWalker>> mvCountMatrix = - new EnumMap>>(GenotypeType.class); + new EnumMap<>(GenotypeType.class); final int NUM_CALLED_GENOTYPETYPES = 3; //HOM_REF, HET, and HOM_VAR double[] configurationLikelihoodsMatrix = new double[NUM_CALLED_GENOTYPETYPES*NUM_CALLED_GENOTYPETYPES*NUM_CALLED_GENOTYPETYPES]; - ArrayList trios = new ArrayList(); - - private final String JOINT_LIKELIHOOD_TAG_NAME = "JL"; - private final String JOINT_POSTERIOR_TAG_NAME = "JP"; - private final String PHRED_SCALED_POSTERIORS_KEY = "PP"; + ArrayList trios = new ArrayList<>(); public final double NO_JOINT_VALUE = -1.0; @@ -107,9 +104,9 @@ public class FamilyLikelihoodsUtils { */ public void getUpdatedGenotypes(final VariantContext vc, final Genotype motherGenotype, final Genotype fatherGenotype, final Genotype childGenotype, final ArrayList updatedGenotypes){ //genotypes here can be no call - boolean fatherIsCalled = fatherGenotype != null && hasCalledGT(fatherGenotype.getType()); - boolean motherIsCalled = motherGenotype != null && hasCalledGT(motherGenotype.getType()); - boolean childIsCalled = childGenotype != null && hasCalledGT(childGenotype.getType()); + boolean fatherIsCalled = fatherGenotype != null && hasCalledGT(fatherGenotype.getType()) && fatherGenotype.hasLikelihoods(); + boolean motherIsCalled = motherGenotype != null && hasCalledGT(motherGenotype.getType()) && motherGenotype.hasLikelihoods(); + boolean childIsCalled = childGenotype != null && hasCalledGT(childGenotype.getType()) && childGenotype.hasLikelihoods(); //default to posteriors equal to likelihoods (flat priors) in case input genotypes are not called double[] uninformativeLikelihoods = {ONE_THIRD, ONE_THIRD, ONE_THIRD}; @@ -158,10 +155,10 @@ public class FamilyLikelihoodsUtils { } //Add the joint trio calculations - final Map genotypeAttributes = new HashMap(); + final Map genotypeAttributes = new HashMap<>(); genotypeAttributes.putAll(genotype.getExtendedAttributes()); - genotypeAttributes.put(JOINT_LIKELIHOOD_TAG_NAME, phredScaledJL); - genotypeAttributes.put(JOINT_POSTERIOR_TAG_NAME, phredScaledJP); + genotypeAttributes.put(GATKVCFConstants.JOINT_LIKELIHOOD_TAG_NAME, phredScaledJL); + genotypeAttributes.put(GATKVCFConstants.JOINT_POSTERIOR_TAG_NAME, phredScaledJP); final GenotypeBuilder builder = new GenotypeBuilder(genotype); @@ -171,7 +168,7 @@ public class FamilyLikelihoodsUtils { GATKVariantContextUtils.updateGenotypeAfterSubsetting(vc.getAlleles(), builder, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, log10Posteriors, vc.getAlleles()); - builder.attribute(PHRED_SCALED_POSTERIORS_KEY, + builder.attribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY, Utils.listFromPrimitives(GenotypeLikelihoods.fromLog10Likelihoods(log10Posteriors).getAsPLs())); builder.attributes(genotypeAttributes); return builder.make(); @@ -231,7 +228,7 @@ public class FamilyLikelihoodsUtils { } break; default: - throw new UserException(String.format("%d does not indicate a valid trio FamilyMember -- use 0 for mother, 1 for father, 2 for child",recalcInd)); + throw new UserException(String.format("%d does not indicate a valid trio FamilyMember -- use 0 for mother, 1 for father, 2 for child",recalcInd.ordinal())); } recalcPosteriors[0] = MathUtils.log10sumLog10(marginalOverChangedHR,0); @@ -262,7 +259,7 @@ public class FamilyLikelihoodsUtils { continue; } - final ArrayList trioGenotypes = new ArrayList(3); + final ArrayList trioGenotypes = new ArrayList<>(3); updateFamilyGenotypes(vc, mother, father, child, trioGenotypes); //replace uses sample names to match genotypes, so order doesn't matter @@ -282,12 +279,12 @@ public class FamilyLikelihoodsUtils { private ArrayList setTrios(Set vcfSamples, Map> families){ Set family; ArrayList parents; - final ArrayList trios = new ArrayList(); + final ArrayList trios = new ArrayList<>(); for(final Map.Entry> familyEntry : families.entrySet()){ family = familyEntry.getValue(); // Since getFamilies(vcfSamples) above still returns parents of samples in the VCF even if those parents are not in the VCF, need to subset down here: - final Set familyMembersInVCF = new TreeSet(); + final Set familyMembersInVCF = new TreeSet<>(); for(final Sample familyMember : family){ if (vcfSamples.contains(familyMember.getID())) { familyMembersInVCF.add(familyMember); @@ -298,7 +295,7 @@ public class FamilyLikelihoodsUtils { if(family.size() == 3){ for(final Sample familyMember : family){ parents = familyMember.getParents(); - if(parents.size()>0){ + if(parents.size()==2){ if(family.containsAll(parents)) trios.add(familyMember); } @@ -331,7 +328,7 @@ public class FamilyLikelihoodsUtils { if(child == GenotypeType.NO_CALL || child == GenotypeType.UNAVAILABLE) return 0; //Add parents with genotypes for the evaluation - final ArrayList parents = new ArrayList(); + final ArrayList parents = new ArrayList<>(); if (!(mother == GenotypeType.NO_CALL || mother == GenotypeType.UNAVAILABLE)) parents.add(mother); if (!(father == GenotypeType.NO_CALL || father == GenotypeType.UNAVAILABLE)) @@ -426,11 +423,11 @@ public class FamilyLikelihoodsUtils { //Get a Map of genotype (log10)likelihoods private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ - final EnumMap likelihoodsMap = new EnumMap(GenotypeType.class); + final EnumMap likelihoodsMap = new EnumMap<>(GenotypeType.class); double[] likelihoods; - if (genotype != null && hasCalledGT(genotype.getType()) && genotype.hasExtendedAttribute(PHRED_SCALED_POSTERIORS_KEY)) { - Object GPfromVCF = genotype.getExtendedAttribute(PHRED_SCALED_POSTERIORS_KEY); + if (genotype != null && hasCalledGT(genotype.getType()) && genotype.hasExtendedAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY)) { + Object GPfromVCF = genotype.getExtendedAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY); //parse the GPs into a vector of probabilities final String[] likelihoodsAsStringVector = ((String)GPfromVCF).split(","); final double[] likelihoodsAsVector = new double[likelihoodsAsStringVector.length]; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java index 2fdc0f5af..9e71a0818 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java @@ -58,9 +58,12 @@ import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.Reference; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.TreeReducible; @@ -69,35 +72,32 @@ import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.genotyper.*; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.GeneralPloidyFailOverAFCalculatorProvider; -import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller; import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.gatk.utils.variant.ReferenceConfidenceVariantContextMerger; import java.util.*; /** - * Genotypes any number of gVCF files that were produced by the Haplotype Caller into a single joint VCF file. + * Perform joint genotyping on gVCF files produced by HaplotypeCaller * *

    - * GenotypeGVCFs merges gVCF records that were produced as part of the reference model-based variant discovery pipeline (see documentation for more details) using - * the '-ERC GVCF' or '-ERC BP_RESOLUTION' mode of the HaplotypeCaller. This tool performs the multi-sample joint aggregation - * step and merges the records together in a sophisticated manner. - * - * At all positions of the target, this tool will combine all spanning records, produce correct genotype likelihoods, - * re-genotype the newly merged record, and then re-annotate it. - * - * Note that this tool cannot work with just any gVCF files - they must have been produced with the HaplotypeCaller, - * which uses a sophisticated reference model to produce accurate genotype likelihoods for every position in the target. + * GenotypeGVCFs merges gVCF records that were produced as part of the Best Practices workflow for variant discovery + * (see Best Practices documentation for more details) using the '-ERC GVCF' or '-ERC BP_RESOLUTION' mode of the + * HaplotypeCaller, or result from combining such gVCF files using CombineGVCFs. This tool performs the multi-sample + * joint aggregation step and merges the records together in a sophisticated manner: at each position of the input + * gVCFs, this tool will combine all spanning records, produce correct genotype likelihoods, re-genotype the newly + * merged record, and then re-annotate it.

    * *

    Input

    *

    - * One or more Haplotype Caller gVCFs to genotype. + * One or more HaplotypeCaller gVCFs to genotype. *

    * *

    Output

    @@ -105,16 +105,25 @@ import java.util.*; * A combined, genotyped VCF. *

    * - *

    Examples

    + *

    Usage example

    *
    - * java -Xmx2g -jar GenomeAnalysisTK.jar \
    - *   -R ref.fasta \
    + * java -jar GenomeAnalysisTK.jar \
      *   -T GenotypeGVCFs \
    - *   --variant gvcf1.vcf \
    - *   --variant gvcf2.vcf \
    + *   -R reference.fasta \
    + *   --variant sample1.g.vcf \
    + *   --variant sample2.g.vcf \
      *   -o output.vcf
      * 
    * + *

    Caveat

    + *

    Only gVCF files produced by HaplotypeCaller (or CombineGVCFs) can be used as input for this tool. Some other + * programs produce files that they call gVCFs but those lack some important information (accurate genotype likelihoods + * for every position) that GenotypeGVCFs requires for its operation.

    + * + *

    Special note on ploidy

    + *

    This tool is able to handle any ploidy (or mix of ploidies) intelligently; there is no need to specify ploidy + * for non-diploid organisms.

    + * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=-10,stop=10)) @@ -134,7 +143,15 @@ public class GenotypeGVCFs extends RodWalker annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"InbreedingCoeff", "FisherStrand", "QualByDepth", "ChromosomeCounts", "GenotypeSummaries", "StrandOddsRatio"})); + protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"InbreedingCoeff", "FisherStrand", "QualByDepth", "ChromosomeCounts", "StrandOddsRatio"})); /** * The rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. Note that dbSNP is not used in any way for the calculations themselves. @@ -163,13 +180,33 @@ public class GenotypeGVCFs extends RodWalker variantCollection : variantCollections ) + for ( final RodBindingCollection variantCollection : variantCollections ) { variants.addAll(variantCollection.getRodBindings()); + if (uniquifySamples) { + for (final RodBinding rb : variantCollection.getRodBindings()) { + //are inputs passed in with -V:fileTag ? + if (!rb.getTags().isEmpty()) inputsAreTagged = true; + } + } + } + //RodBinding tags are used in sample uniquification + if (inputsAreTagged) + logger.warn("Output uniquified VCF may not be suitable for input to CombineSampleData because input VCF(s) contain tags."); final GenomeAnalysisEngine toolkit = getToolkit(); final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(toolkit, variants); - final SampleList samples = new IndexedSampleList(SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); + + final GATKVariantContextUtils.GenotypeMergeType mergeType; + if(uniquifySamples) { + mergeType = GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY; + } + else + mergeType = GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE; + + final SampleList samples = new IndexedSampleList(SampleUtils.getSampleList(vcfRods, mergeType)); // create the genotyping engine genotypingEngine = new UnifiedGenotypingEngine(createUAC(), samples, toolkit.getGenomeLocParser(), GeneralPloidyFailOverAFCalculatorProvider.createThreadSafeProvider(toolkit, genotypeArgs, logger), toolkit.getArguments().BAQMode); @@ -180,8 +217,13 @@ public class GenotypeGVCFs extends RodWalker headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); headerLines.addAll(annotationEngine.getVCFAnnotationDescriptions()); headerLines.addAll(genotypingEngine.getAppropriateVCFInfoHeaders()); - // add the pool values for each genotype - VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.MLE_ALLELE_COUNT_KEY, VCFConstants.MLE_ALLELE_FREQUENCY_KEY); + + // add headers for annotations added by this tool + headerLines.add(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_NAME, "Represents any possible spanning deletion allele at this location")); + headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY)); + headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY)); + headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY)); + headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags if ( dbsnp != null && dbsnp.dbsnp.isBound() ) VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY); @@ -197,7 +239,7 @@ public class GenotypeGVCFs extends RodWalker originalAttributes, final VariantContext newVC) { // we want to carry forward the attributes from the original VC but make sure to add the MLE-based annotations final Map attrs = new HashMap<>(originalAttributes); - attrs.put(VCFConstants.MLE_ALLELE_COUNT_KEY, newVC.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY)); - attrs.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, newVC.getAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY)); - if (newVC.hasAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY)) - attrs.put(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, newVC.getAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY)); + attrs.put(GATKVCFConstants.MLE_ALLELE_COUNT_KEY, newVC.getAttribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY)); + attrs.put(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY, newVC.getAttribute(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY)); + if (newVC.hasAttribute(GATKVCFConstants.NUMBER_OF_DISCOVERED_ALLELES_KEY)) + attrs.put(GATKVCFConstants.NUMBER_OF_DISCOVERED_ALLELES_KEY, newVC.getAttribute(GATKVCFConstants.NUMBER_OF_DISCOVERED_ALLELES_KEY)); return new VariantContextBuilder(newVC).attributes(attrs).make(); } @@ -278,6 +330,7 @@ public class GenotypeGVCFs extends RodWalker resources, final int numRefSamplesFromMissingResources, @@ -109,12 +108,12 @@ public class PosteriorLikelihoodsUtils { //parse the likelihoods for each sample's genotype final List likelihoods = new ArrayList<>(vc1.getNSamples()); for ( final Genotype genotype : vc1.getGenotypes() ) { - if (!genotype.hasExtendedAttribute(PHRED_SCALED_POSTERIORS_KEY)){ + if (!genotype.hasExtendedAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY)){ likelihoods.add(genotype.hasLikelihoods() ? genotype.getLikelihoods().getAsVector() : null ); } else { - Object PPfromVCF = genotype.getExtendedAttribute(PHRED_SCALED_POSTERIORS_KEY); + Object PPfromVCF = genotype.getExtendedAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY); //parse the PPs into a vector of probabilities if (PPfromVCF instanceof String) { final String PPstring = (String)PPfromVCF; @@ -153,7 +152,7 @@ public class PosteriorLikelihoodsUtils { if ( posteriors.get(genoIdx) != null ) { GATKVariantContextUtils.updateGenotypeAfterSubsetting(vc1.getAlleles(), builder, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, posteriors.get(genoIdx), vc1.getAlleles()); - builder.attribute(PHRED_SCALED_POSTERIORS_KEY, + builder.attribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY, Utils.listFromPrimitives(GenotypeLikelihoods.fromLog10Likelihoods(posteriors.get(genoIdx)).getAsPLs())); } newContext.add(builder.make()); @@ -162,7 +161,7 @@ public class PosteriorLikelihoodsUtils { final List priors = Utils.listFromPrimitives( GenotypeLikelihoods.fromLog10Likelihoods(getDirichletPrior(alleleCounts, vc1.getMaxPloidy(2),useFlatPriors)).getAsPLs()); - final VariantContextBuilder builder = new VariantContextBuilder(vc1).genotypes(newContext).attribute("PG", priors); + final VariantContextBuilder builder = new VariantContextBuilder(vc1).genotypes(newContext).attribute(GATKVCFConstants.GENOTYPE_PRIOR_KEY, priors); // add in the AC, AF, and AN attributes VariantContextUtils.calculateChromosomeCounts(builder, true); return builder.make(); @@ -266,8 +265,8 @@ public class PosteriorLikelihoodsUtils { private static void addAlleleCounts(final Map counts, final VariantContext context, final boolean useAC) { final int[] ac; //use MLEAC value... - if ( context.hasAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY) && ! useAC ) { - ac = getAlleleCounts(VCFConstants.MLE_ALLELE_COUNT_KEY, context); + if ( context.hasAttribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY) && ! useAC ) { + ac = getAlleleCounts(GATKVCFConstants.MLE_ALLELE_COUNT_KEY, context); } //...unless specified by the user in useAC or unless MLEAC is absent else if ( context.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { @@ -346,7 +345,7 @@ public class PosteriorLikelihoodsUtils { } if ( mleList == null ) throw new IllegalArgumentException(String.format("VCF does not have properly formatted "+ - VCFConstants.MLE_ALLELE_COUNT_KEY+" or "+VCFConstants.ALLELE_COUNT_KEY)); + GATKVCFConstants.MLE_ALLELE_COUNT_KEY+" or "+VCFConstants.ALLELE_COUNT_KEY)); final int[] mle = new int[mleList.size()]; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java new file mode 100644 index 000000000..d16e86eb3 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ReferenceConfidenceVariantContextMerger.java @@ -0,0 +1,464 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.variantutils; + +import htsjdk.variant.variantcontext.*; +import htsjdk.variant.vcf.VCFConstants; +import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculators; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; + +import java.util.*; + +/** + * Variant context utilities related to merging variant-context instances. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class ReferenceConfidenceVariantContextMerger { + + private static Comparable combineAnnotationValues( final List array ) { + return MathUtils.median(array); // right now we take the median but other options could be explored + } + + /** + * Merges VariantContexts from gVCFs into a single hybrid. + * Assumes that none of the input records are filtered. + * + * @param VCs collection of unsorted genomic VCs + * @param loc the current location + * @param refBase the reference allele to use if all contexts in the VC are spanning (i.e. don't start at the location in loc); if null, we'll return null in this case + * @param removeNonRefSymbolicAllele if true, remove the allele from the merged VC + * @param samplesAreUniquified if true, sample names have been uniquified + * @return new VariantContext representing the merge of all VCs or null if it not relevant + */ + public static VariantContext merge(final List VCs, final GenomeLoc loc, final Byte refBase, final boolean removeNonRefSymbolicAllele, + final boolean samplesAreUniquified) { + // this can happen if e.g. you are using a dbSNP file that spans a region with no gVCFs + if ( VCs == null || VCs.size() == 0 ) + return null; + + // establish the baseline info (sometimes from the first VC) + final VariantContext first = VCs.get(0); + final String name = first.getSource(); + + // ref allele + final Allele refAllele = determineReferenceAlleleGivenReferenceBase(VCs, loc, refBase); + if ( refAllele == null ) + return null; + + // FinalAlleleSet contains the alleles of the new resulting VC + // Using linked set in order to guarantee a stable order + final LinkedHashSet finalAlleleSet = new LinkedHashSet<>(10); + // Reference goes first + finalAlleleSet.add(refAllele); + + final Map attributes = new LinkedHashMap<>(); + final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id + int depth = 0; + final Map> annotationMap = new LinkedHashMap<>(); + final GenotypesContext genotypes = GenotypesContext.create(); + + // In this list we hold the mapping of each variant context alleles. + final List>> vcAndNewAllelePairs = new ArrayList<>(VCs.size()); + // Keep track of whether we saw a spanning deletion and a non-spanning event + boolean sawSpanningDeletion = false; + boolean sawNonSpanningEvent = false; + + // cycle through and add info from the other VCs + for ( final VariantContext vc : VCs ) { + + // if this context doesn't start at the current location then it must be a spanning event (deletion or ref block) + final boolean isSpanningEvent = loc.getStart() != vc.getStart(); + // record whether it's also a spanning deletion/event (we know this because the VariantContext type is no + // longer "symbolic" but "mixed" because there are real alleles mixed in with the symbolic non-ref allele) + sawSpanningDeletion |= ( isSpanningEvent && vc.isMixed() ) || vc.getAlternateAlleles().contains(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE); + sawNonSpanningEvent |= ( !isSpanningEvent && vc.isMixed() ); + + vcAndNewAllelePairs.add(new Pair<>(vc, isSpanningEvent ? replaceWithNoCallsAndDels(vc) : remapAlleles(vc, refAllele, finalAlleleSet))); + } + + // Add and to the end if at all required in in the output. + if ( sawSpanningDeletion && (sawNonSpanningEvent || !removeNonRefSymbolicAllele) ) finalAlleleSet.add(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE); + if (!removeNonRefSymbolicAllele) finalAlleleSet.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + + final List allelesList = new ArrayList<>(finalAlleleSet); + + for ( final Pair> pair : vcAndNewAllelePairs ) { + final VariantContext vc = pair.getFirst(); + final List remappedAlleles = pair.getSecond(); + + mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList, samplesAreUniquified); + + // special case DP (add it up) for all events + if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) { + depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); + } else { // handle the gVCF case from the HaplotypeCaller + for( final Genotype gt : vc.getGenotypes() ) { + depth += (gt.hasExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY) ? Integer.parseInt((String)gt.getAnyAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY)) : (gt.hasDP() ? gt.getDP() : 0)); + } + } + + if ( loc.getStart() != vc.getStart() ) + continue; + + // special case ID (just preserve it) + if ( vc.hasID() ) rsIDs.add(vc.getID()); + + // add attributes + addReferenceConfidenceAttributes(vc.getAttributes(), annotationMap); + } + + // when combining annotations use the median value from all input VCs which had annotations provided + for ( final Map.Entry> p : annotationMap.entrySet() ) { + if ( ! p.getValue().isEmpty() ) { + attributes.put(p.getKey(), combineAnnotationValues(p.getValue())); + } + } + + if ( depth > 0 ) + attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); + + // remove stale AC and AF based attributes + removeStaleAttributesAfterMerge(attributes); + + final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); + + // note that in order to calculate the end position, we need a list of alleles that doesn't include anything symbolic + final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(allelesList) + .chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(nonSymbolicAlleles(allelesList), loc.getStart(), loc.getStart()) + .genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to re-genotype later + + return builder.make(); + } + + /** + * @param list the original alleles list + * @return a non-null list of non-symbolic alleles + */ + private static List nonSymbolicAlleles(final List list) { + final List result = new ArrayList<>(list.size()); + for ( final Allele allele : list ) { + if ( !allele.isSymbolic() ) + result.add(allele); + } + return result; + } + + /** + * Determines the ref allele given the provided reference base at this position + * + * @param VCs collection of unsorted genomic VCs + * @param loc the current location + * @param refBase the reference allele to use if all contexts in the VC are spanning + * @return new Allele or null if no reference allele/base is available + */ + private static Allele determineReferenceAlleleGivenReferenceBase(final List VCs, final GenomeLoc loc, final Byte refBase) { + final Allele refAllele = GATKVariantContextUtils.determineReferenceAllele(VCs, loc); + if ( refAllele == null ) + return ( refBase == null ? null : Allele.create(refBase, true) ); + return refAllele; + } + + /** + * Remove the stale attributes from the merged set + * + * @param attributes the attribute map + */ + private static void removeStaleAttributesAfterMerge(final Map attributes) { + attributes.remove(VCFConstants.ALLELE_COUNT_KEY); + attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY); + attributes.remove(VCFConstants.ALLELE_NUMBER_KEY); + attributes.remove(GATKVCFConstants.MLE_ALLELE_COUNT_KEY); + attributes.remove(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY); + attributes.remove(VCFConstants.END_KEY); + } + + /** + * Adds attributes to the global map from the new context in a sophisticated manner + * + * @param myAttributes attributes to add from + * @param annotationMap map of annotations for combining later + */ + private static void addReferenceConfidenceAttributes(final Map myAttributes, + final Map> annotationMap) { + for ( final Map.Entry p : myAttributes.entrySet() ) { + final String key = p.getKey(); + final Object value = p.getValue(); + + // add the annotation values to a list for combining later + List values = annotationMap.get(key); + if( values == null ) { + values = new ArrayList<>(); + annotationMap.put(key, values); + } + try { + final String stringValue = value.toString(); + // Branch to avoid unintentional, implicit type conversions that occur with the ? operator. + if (stringValue.contains(".")) + values.add(Double.parseDouble(stringValue)); + else + values.add(Integer.parseInt(stringValue)); + } catch (final NumberFormatException e) { + // nothing to do + } + } + } + + /** + * This method does a couple of things: + *
    • + * remaps the vc alleles considering the differences between the final reference allele and its own reference,
    • + *
    • + * collects alternative alleles present in variant context and add them to the {@code finalAlleles} set. + *
    + * + * @param vc the variant context. + * @param refAllele final reference allele. + * @param finalAlleles where to add the final set of non-ref called alleles. + * @return never {@code null} + */ + //TODO as part of a larger refactoring effort {@link #remapAlleles} can be merged with {@link GATKVariantContextUtils#remapAlleles}. + private static List remapAlleles(final VariantContext vc, final Allele refAllele, final LinkedHashSet finalAlleles) { + + final Allele vcRef = vc.getReference(); + final byte[] refBases = refAllele.getBases(); + final int extraBaseCount = refBases.length - vcRef.getBases().length; + if (extraBaseCount < 0) throw new IllegalStateException("the wrong reference was selected"); + + final List result = new ArrayList<>(vc.getNAlleles()); + result.add(refAllele); + + for (final Allele a : vc.getAlternateAlleles()) { + if (a.isSymbolic()) { + result.add(a); + // we always skip when adding to finalAlleles; this is done outside if applies. + // we also skip <*DEL> if there isn't a real alternate allele. + if ( !a.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) && !vc.isSymbolic() ) + finalAlleles.add(a); + } else if (a.isCalled()) { + final Allele newAllele; + if (extraBaseCount > 0) { + final byte[] oldBases = a.getBases(); + final byte[] newBases = Arrays.copyOf(oldBases,oldBases.length + extraBaseCount); + System.arraycopy(refBases,refBases.length - extraBaseCount,newBases,oldBases.length,extraBaseCount); + newAllele = Allele.create(newBases,false); + } else + newAllele = a; + result.add(newAllele); + finalAlleles.add(newAllele); + } else { // NO_CALL and strange miscellanea + result.add(a); + } + } + return result; + } + + /** + * Replaces any alleles in the VariantContext with NO CALLS or the symbolic deletion allele as appropriate, except for the generic ALT allele + * + * @param vc VariantContext with the alleles to replace + * @return non-null list of alleles + */ + private static List replaceWithNoCallsAndDels(final VariantContext vc) { + if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null"); + + final List result = new ArrayList<>(vc.getNAlleles()); + + // no-call the reference allele + result.add(Allele.NO_CALL); + + // handle the alternate alleles + for ( final Allele allele : vc.getAlternateAlleles() ) { + final Allele replacement; + if ( allele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) ) + replacement = allele; + else if ( allele.length() < vc.getReference().length() ) + replacement = GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE; + else + replacement = Allele.NO_CALL; + + result.add(replacement); + } + return result; + } + + /** + * Merge into the context a new genotype represented by the given VariantContext for the provided list of target alleles. + * This method assumes that none of the alleles in the VC overlaps with any of the alleles in the set. + * + * @param mergedGenotypes the genotypes context to add to + * @param VC the Variant Context for the sample + * @param remappedAlleles the list of remapped alleles for the sample + * @param targetAlleles the list of target alleles + * @param samplesAreUniquified true if sample names have been uniquified + */ + private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes, + final VariantContext VC, + final List remappedAlleles, + final List targetAlleles, + final boolean samplesAreUniquified) { + final int maximumPloidy = VC.getMaxPloidy(GATKVariantContextUtils.DEFAULT_PLOIDY); + // the map is different depending on the ploidy, so in order to keep this method flexible (mixed ploidies) + // we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible. + final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][]; + final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size()); + int[] perSampleIndexesOfRelevantAlleles; + + for ( final Genotype g : VC.getGenotypes() ) { + final String name; + if (samplesAreUniquified) + name = g.getSampleName() + "." + VC.getSource(); + else + name = g.getSampleName(); + final int ploidy = g.getPloidy(); + final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy())); + genotypeBuilder.name(name); + if (g.hasPL()) { + // lazy initialization of the genotype index map by ploidy. + perSampleIndexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart(), g); + final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null + ? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(perSampleIndexesOfRelevantAlleles) + : genotypeIndexMapsByPloidy[ploidy]; + final int[] PLs = generatePL(g, genotypeIndexMapByPloidy); + final int[] AD = g.hasAD() ? generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null; + genotypeBuilder.PL(PLs).AD(AD); + } + mergedGenotypes.add(genotypeBuilder.make()); + } + } + + /** + * Composes a new likelihood array given the original genotype and the genotype index map. + * + * @param g the original genotype. + * @param genotypeIndexMapByPloidy genotype index map. The ith element indicates what genotype in {@code g} corresponds + * to the ith genotype in the return likelihoods array. + * + * @throws NullPointerException if {@code g} or {@code genotypeIndexMapByPloidy} is {@code null}, or if {@code g} + * does not contain likelihoods. + * @throws IndexOutOfBoundsException if {@code genotypeIndexMapByPloidy} contain non valid + * genotype indices given the likelihood array in {@code g}. + * + * @return never {@code null} but an array of exactly {@code genotypeIndexMapByPloidy.length} positions. + */ + private static int[] generatePL(final Genotype g, final int[] genotypeIndexMapByPloidy) { + final int[] PLs = new int[genotypeIndexMapByPloidy.length]; + final int[] oldPLs = g.getPL(); + for (int i = 0; i < PLs.length; i++) + PLs[i] = oldPLs[genotypeIndexMapByPloidy[i]]; + return PLs; + } + + /** + * Determines the allele mapping from myAlleles to the targetAlleles, substituting the generic "" as appropriate. + * If the myAlleles set does not contain "" as an allele, it throws an exception. + * + * @param remappedAlleles the list of alleles to evaluate + * @param targetAlleles the target list of alleles + * @param position position to output error info + * @param g genotype from which targetAlleles are derived + * @return non-null array of ints representing indexes + */ + protected static int[] getIndexesOfRelevantAlleles(final List remappedAlleles, final List targetAlleles, final int position, final Genotype g) { + + if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty"); + if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty"); + + if ( !remappedAlleles.contains(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) ) + throw new UserException("The list of input alleles must contain " + GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records"); + + final int indexOfNonRef = remappedAlleles.indexOf(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + final int[] indexMapping = new int[targetAlleles.size()]; + + // the reference likelihoods should always map to each other (even if the alleles don't) + indexMapping[0] = 0; + + // create the index mapping, using the allele whenever such a mapping doesn't exist + for ( int i = 1; i < targetAlleles.size(); i++ ) { + final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i)); + indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele; + } + + return indexMapping; + } + + /** + * Generates a new AD array by adding zeros for missing alleles given the set of indexes of the Genotype's current + * alleles from the original AD. + * + * @param originalAD the original AD to extend + * @param indexesOfRelevantAlleles the indexes of the original alleles corresponding to the new alleles + * @return non-null array of new AD values + */ + protected static int[] generateAD(final int[] originalAD, final int[] indexesOfRelevantAlleles) { + if ( originalAD == null || indexesOfRelevantAlleles == null ) throw new IllegalArgumentException("The list of input AD values and alleles must not be null"); + + final int numADs = indexesOfRelevantAlleles.length; + final int[] newAD = new int[numADs]; + + for ( int i = 0; i < numADs; i++ ) { + final int oldIndex = indexesOfRelevantAlleles[i]; + if ( oldIndex >= originalAD.length ) + newAD[i] = 0; + else + newAD[i] = originalAD[oldIndex]; + } + + return newAD; + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java index 136d3d67e..134f5e514 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java @@ -52,24 +52,24 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleListUtils; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.FixedAFCalculatorProvider; import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.TreeReducible; import org.broadinstitute.gatk.tools.walkers.genotyper.*; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; @@ -84,13 +84,14 @@ import java.util.HashSet; import java.util.Set; /** - * Regenotypes the variants from a VCF. VCF records must contain PLs or GLs. + * Regenotypes the variants from a VCF containing PLs or GLs. * *

    - * This tool triggers re-genotyping of the samples through the Exact Allele Frequency calculation model. Note that this is truly the - * mathematically correct way to select samples from a larger set (especially when calls were generated from low coverage sequencing data); - * using the hard genotypes to select (i.e. the default mode of SelectVariants) can lead to false positives when errors are confused for - * variants in the original genotyping. This functionality used to comprise the --regenotype option in SelectVariants but we pulled it out + * This tool triggers re-genotyping of the samples through the Exact Allele Frequency calculation model. Note that + * this is truly the mathematically correct way to select samples from a larger set (especially when calls were + * generated from low coverage sequencing data); using the hard genotypes to select (i.e. the default mode of + * SelectVariants) can lead to false positives when errors are confused for variants in the original genotyping. + * This functionality used to comprise the --regenotype option in SelectVariants but we pulled it out * into its own tool for technical purposes. * *

    Input

    @@ -103,11 +104,11 @@ import java.util.Set; * A re-genotyped VCF. *

    * - *

    Examples

    + *

    Usage example

    *
    - * java -Xmx2g -jar GenomeAnalysisTK.jar \
    - *   -R ref.fasta \
    + * java -jar GenomeAnalysisTK.jar \
      *   -T RegenotypeVariants \
    + *   -R reference.fasta \
      *   --variant input.vcf \
      *   -o output.vcf
      * 
    diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gga/GenotypingGivenAllelesUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gga/GenotypingGivenAllelesUtils.java index 3d501a71c..134842bcc 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gga/GenotypingGivenAllelesUtils.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gga/GenotypingGivenAllelesUtils.java @@ -53,7 +53,7 @@ package org.broadinstitute.gatk.utils.gga; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.commandline.RodBinding; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java index 1ad22c4a3..fd8a98775 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriter.java @@ -57,6 +57,8 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.*; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import java.util.HashMap; @@ -71,10 +73,6 @@ import java.util.List; * Time: 2:51 PM */ public class GVCFWriter implements VariantContextWriter { - // - // static VCF field names - // - protected final static String MIN_DP_FORMAT_FIELD = "MIN_DP"; // // Final fields initialized in constructor @@ -151,7 +149,7 @@ public class GVCFWriter implements VariantContextWriter { public void writeHeader(VCFHeader header) { if ( header == null ) throw new IllegalArgumentException("header cannot be null"); header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); - header.addMetaDataLine(new VCFFormatHeaderLine(MIN_DP_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block")); + header.addMetaDataLine(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.MIN_DP_FORMAT_KEY)); for ( final HomRefBlock partition : GQPartitions ) { header.addMetaDataLine(partition.toVCFHeaderLine()); @@ -245,10 +243,13 @@ public class GVCFWriter implements VariantContextWriter { // create the single Genotype with GQ and DP annotations final GenotypeBuilder gb = new GenotypeBuilder(sampleName, GATKVariantContextUtils.homozygousAlleleList(block.getRef(),block.getPloidy())); gb.noAD().noPL().noAttributes(); // clear all attributes - gb.GQ(block.getMedianGQ()); + + final int[] minPLs = block.getMinPLs(); + gb.PL(minPLs); + final int gq = genotypeQualityFromPLs(minPLs); + gb.GQ(gq); gb.DP(block.getMedianDP()); - gb.attribute(MIN_DP_FORMAT_FIELD, block.getMinDP()); - gb.PL(block.getMinPLs()); + gb.attribute(GATKVCFConstants.MIN_DP_FORMAT_KEY, block.getMinDP()); // This annotation is no longer standard //gb.attribute(MIN_GQ_FORMAT_FIELD, block.getMinGQ()); @@ -256,6 +257,26 @@ public class GVCFWriter implements VariantContextWriter { return vcb.genotypes(gb.make()).make(); } + + private int genotypeQualityFromPLs(final int[] minPLs) { + int first = minPLs[0]; + int second = minPLs[1]; + if (first > second) { + second = first; + first = minPLs[1]; + } + for (int i = 3; i < minPLs.length; i++) { + final int candidate = minPLs[i]; + if (candidate >= second) continue; + if (candidate <= first) { + second = first; + first = candidate; + } else + second = candidate; + } + return second - first; + } + /** * Helper function to create a new HomRefBlock from a variant context and current genotype * @@ -307,7 +328,7 @@ public class GVCFWriter implements VariantContextWriter { } final Genotype g = vc.getGenotype(0); - if ( g.isHomRef() && vc.hasAlternateAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) && vc.isBiallelic() ) { + if ( g.isHomRef() && vc.hasAlternateAllele(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) && vc.isBiallelic() ) { // create bands final VariantContext maybeCompletedBand = addHomRefSite(vc, g); if ( maybeCompletedBand != null ) underlyingWriter.add(maybeCompletedBand); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlock.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlock.java index 8e2308590..6e27a5a63 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlock.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlock.java @@ -187,7 +187,10 @@ final class HomRefBlock { } public VCFHeaderLine toVCFHeaderLine() { - return new VCFHeaderLine("GVCFBlock", "minGQ=" + getGQLowerBound() + "(inclusive),maxGQ=" + getGQUpperBound() + "(exclusive)"); + // Need to uniquify the key for the header line using the min/max GQ, since + // VCFHeader does not allow lines with duplicate keys. + final String key = String.format("GVCFBlock%d-%d", getGQLowerBound(), getGQUpperBound()); + return new VCFHeaderLine(key, "minGQ=" + getGQLowerBound() + "(inclusive),maxGQ=" + getGQUpperBound() + "(exclusive)"); } /** diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java deleted file mode 100644 index 57f823dac..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java +++ /dev/null @@ -1,204 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import com.google.java.contract.Requires; -import htsjdk.variant.variantcontext.VariantContext; -import org.broadinstitute.gatk.tools.walkers.genotyper.AlleleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.AlleleListUtils; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleListUtils; -import org.broadinstitute.gatk.tools.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; - -import java.util.*; - -/** - * Computes the likelihood based probability that haplotypes for first and second variant contexts - * only appear in their fully linked form (x11 and x22) given a set of haplotypes where they might occur - * and read likelihoods per sample - * - * User: depristo - * Date: 3/29/13 - * Time: 9:23 AM - */ -public class HaplotypeLDCalculator { - private final List haplotypes; - private final ReadLikelihoods readLikelihoods; - private List> haplotypeLikelihoodsPerSample = null; - - // linear contigency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1] - private final double[] table = new double[4]; - - /** - * For testing - */ - @SuppressWarnings("unchecked") - protected HaplotypeLDCalculator() { - haplotypes = Collections.emptyList(); - final AlleleList alleleList = AlleleListUtils.emptyList(); - readLikelihoods = new ReadLikelihoods<>(SampleListUtils.emptyList(), - alleleList, Collections.EMPTY_MAP); - } - - public HaplotypeLDCalculator(final List haplotypes, final ReadLikelihoods haplotypeReadMap) { - this.haplotypes = haplotypes; - this.readLikelihoods = haplotypeReadMap; - } - - /** - * Construct the cached list of summed haplotype likelihoods per sample if it - * hasn't already been computed. This data structure is lazy created but only - * needs to be made once when we make 1 merge decision as the data doesn't change - * no matter how many calls to computeProbOfBeingPhased - */ - private void buildHaplotypeLikelihoodsPerSampleIfNecessary() { - if ( haplotypeLikelihoodsPerSample == null ) { - // do the lazy computation - final Set samples = new LinkedHashSet<>(readLikelihoods.samples()); - haplotypeLikelihoodsPerSample = new LinkedList<>(); - for( final String sample : samples ) { - final Map map = new HashMap<>(haplotypes.size()); - for( final Haplotype h : haplotypes ) { - // count up the co-occurrences of the events for the R^2 calculation - final double haplotypeLikelihood = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, readLikelihoods, Collections.singletonList(h), false)[0][0]; - map.put(h, haplotypeLikelihood); - } - haplotypeLikelihoodsPerSample.add(map); - } - } - } - - /** - * Compute the likelihood based probability that that haplotypes for first and second are only x11 and x22 - * - * As opposed to the hypothesis that all four haplotypes (x11, x12, x21, and x22) exist in the population - * - * @param first a non-null VariantContext - * @param second a non-null VariantContext - * @return the probability that only x11 and x22 exist among the samples - */ - protected double computeProbOfBeingPhased(final VariantContext first, final VariantContext second) { - buildHaplotypeLikelihoodsPerSampleIfNecessary(); - - Arrays.fill(table, Double.NEGATIVE_INFINITY); - - for ( final Map entry : haplotypeLikelihoodsPerSample ) { - for ( final Map.Entry haplotypeLikelihood : entry.entrySet() ) { - final Haplotype h = haplotypeLikelihood.getKey(); - // count up the co-occurrences of the events for the R^2 calculation - final VariantContext thisHapVC = h.getEventMap().get(first.getStart()); - final VariantContext nextHapVC = h.getEventMap().get(second.getStart()); // TODO -- add function to take a VC - final int i = thisHapVC == null ? 0 : 1; - final int j = nextHapVC == null ? 0 : 1; - final int index = 2 * i + j; - table[index] = MathUtils.approximateLog10SumLog10(table[index], haplotypeLikelihood.getValue()); - } - } - - return pPhased(table); - } - - /** - * Compute probability that two variants are in phase with each other and that no - * compound hets exist in the population. - * - * Implemented as a likelihood ratio test of the hypothesis: - * - * x11 and x22 are the only haplotypes in the populations - * - * vs. - * - * all four haplotype combinations (x11, x12, x21, and x22) all exist in the population. - * - * Now, since we have to have both variants in the population, we exclude the x11 & x11 state. So the - * p of having just x11 and x22 is P(x11 & x22) + p(x22 & x22). - * - * Alternatively, we might have any configuration that gives us both 1 and 2 alts, which are: - * - * - P(x11 & x12 & x21) -- we have hom-ref and both hets - * - P(x22 & x12 & x21) -- we have hom-alt and both hets - * - P(x22 & x12) -- one haplotype is 22 and the other is het 12 - * - P(x22 & x21) -- one haplotype is 22 and the other is het 21 - * - * The probability is just p11_22 / (p11_22 + p hets) - * - * @param table linear contigency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1] - * doesn't have to be normalized as this function does the normalization internally - * @return the real space probability that the data is phased - */ - @Requires("table.length == 4") - protected double pPhased( double[] table ) { - final double[] normTable = MathUtils.normalizeFromLog10(table, true); - - final double x11 = normTable[0], x12 = normTable[1], x21 = normTable[2], x22 = normTable[3]; - - // probability that we are only x11 && x22 - final double p11_22 = MathUtils.approximateLog10SumLog10(x11 + x22, x22 + x22); - - // probability of having any of the other pairs - final double p11_12_21 = MathUtils.approximateLog10SumLog10(x11 + x12, x11 + x21, x12 + x21); - final double p22_12_21 = MathUtils.approximateLog10SumLog10(x22 + x12, x22 + x21, x12 + x21); - final double p22_12 = x22 + x12; - final double p22_21 = x22 + x21; - final double pOthers = MathUtils.approximateLog10SumLog10(new double[]{p11_12_21, p22_12_21, p22_12, p22_21}); - - // probability of being phases is the ratio of p11_22 / pOthers which in log space is just a substraction - final double log10phased = p11_22 - (MathUtils.approximateLog10SumLog10(p11_22, pOthers)); - - return Math.pow(10.0, log10phased); - } - - protected double pPhasedTest( final double x11, final double x12, final double x21, final double x22 ) { - return pPhased(new double[]{x11, x12, x21, x22}); - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/LDMerger.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/LDMerger.java deleted file mode 100644 index 53b4cff58..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/LDMerger.java +++ /dev/null @@ -1,313 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import org.apache.commons.lang.ArrayUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.GenomeLoc; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; -import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; - -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; -import java.util.TreeSet; - -/** - * Merges VariantContexts in a series of haplotypes according to their pairwise LD - * - * User: depristo - * Date: 3/28/13 - * Time: 6:17 PM - */ -public class LDMerger extends MergeVariantsAcrossHaplotypes { - private final static Logger logger = Logger.getLogger(LDMerger.class); - - private final boolean DEBUG; - private final int minSamplesToMergeSNPs; - private final int minSamplesToMergeOtherEvents; - - public LDMerger(boolean DEBUG, int minSamplesToMergeSNPs, int minSamplesToMergeOtherEvents) { - super(); - this.DEBUG = DEBUG; - this.minSamplesToMergeSNPs = minSamplesToMergeSNPs; - this.minSamplesToMergeOtherEvents = minSamplesToMergeOtherEvents; - } - - protected LDMerger() { - this(false, 1, 1); - } - - // TODO -- should be class arguments and static variables in HC - protected final static int MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE = 6; - protected final static int MAX_DISTANCE_BETWEEN_OTHER_EVENTS_TO_MERGE = 25; - - /** - * We require 99% confidence that only the phased haplotypes exist in the population to merge the records - */ - protected final static double MERGE_EVENTS_PROB_PHASED_THRESHOLD = 0.99; - - /** - * Merge as many events among the haplotypes as possible based on pairwise LD among variants - * - * @param haplotypes a list of haplotypes whose events we want to merge - * @param readLikelihoods map from sample name -> read likelihoods for each haplotype - * @param startPosKeySet a set of starting positions of all events among the haplotypes - * @param ref the reference bases - * @param refLoc the span of the reference bases - */ - @Override - public boolean merge( final List haplotypes, - final ReadLikelihoods readLikelihoods, - final TreeSet startPosKeySet, - final byte[] ref, - final GenomeLoc refLoc ) { - if ( haplotypes == null ) throw new IllegalArgumentException("haplotypes cannot be null"); - if ( readLikelihoods == null ) throw new IllegalArgumentException("readLikelihoods cannot be null"); - if ( startPosKeySet == null ) throw new IllegalArgumentException("startPosKeySet cannot be null"); - if ( ref == null ) throw new IllegalArgumentException("ref cannot be null"); - if ( refLoc == null ) throw new IllegalArgumentException("refLoc cannot be null"); - if ( refLoc.size() != ref.length ) throw new IllegalArgumentException("refLoc size " + refLoc.size() + " != ref.length " + ref.length + " at " + refLoc); - - if( startPosKeySet.size() <= 1 ) { return false; } - - final int nSamples = readLikelihoods.sampleCount(); - final HaplotypeLDCalculator r2Calculator = new HaplotypeLDCalculator(haplotypes, readLikelihoods); - boolean somethingWasMerged = false; - boolean mapWasUpdated = true; - while( mapWasUpdated ) { - mapWasUpdated = mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calculator, nSamples, startPosKeySet, ref, refLoc); - somethingWasMerged |= mapWasUpdated; - } - return somethingWasMerged; - } - - /** - * Merge the next pair of events, if possible - * - * @param haplotypes a list of haplotypes whose events we want to merge - * @param ldCalculator calculates R^2 for pairs of events on demand - * @param startPosKeySet a set of starting positions of all events among the haplotypes - * @param ref the reference bases - * @param refLoc the span of the reference bases - * @return true if something was merged, false otherwise - */ - protected boolean mergeConsecutiveEventsBasedOnLDOnce( final List haplotypes, - final HaplotypeLDCalculator ldCalculator, - final int nSamples, - final TreeSet startPosKeySet, - final byte[] ref, - final GenomeLoc refLoc ) { - // loop over the set of start locations and consider pairs that start near each other - final Iterator iter = startPosKeySet.iterator(); - int thisStart = iter.next(); - while( iter.hasNext() ) { - final int nextStart = iter.next(); - final LDMergeData toMerge = getPairOfEventsToMerge(haplotypes, thisStart, nextStart); - - if ( toMerge.canBeMerged(nSamples) ) { - final double pPhased = ldCalculator.computeProbOfBeingPhased(toMerge.firstVC, toMerge.secondVC); - - if( DEBUG ) { - logger.info("Found consecutive biallelic events with R^2 = " + String.format("%.4f", pPhased)); - logger.info("-- " + toMerge.firstVC); - logger.info("-- " + toMerge.secondVC); - } - - if( pPhased > MERGE_EVENTS_PROB_PHASED_THRESHOLD) { - final VariantContext mergedVC = createMergedVariantContext(toMerge.firstVC, toMerge.secondVC, ref, refLoc); - // if for some reason the merging resulting in a bad allele, mergedVC will be null, and we will just remove first and second - replaceVariantContextsInMap(haplotypes, startPosKeySet, mergedVC, toMerge.firstVC, toMerge.secondVC); - return true; // break out of tree set iteration since it was just updated, start over from the beginning and keep merging events - } - } - - thisStart = nextStart; - } - - return false; - } - - /** - * Info about potential LD merge of two variant contexts - */ - private class LDMergeData { - VariantContext firstVC = null, secondVC = null; - boolean canBeMerged = true; - - /** Tell this object that it cant be merged for some reason */ - public LDMergeData cantBeMerged() { - canBeMerged = false; - return this; - } - - /** - * Can these two events be merged - * @param nSamples the number of samples we're considering - * @return true if we can merge our two variant contexts - */ - public boolean canBeMerged(final int nSamples) { - if ( ! canBeMerged || firstVC == null || secondVC == null ) - return false; - - final int distance = secondVC.getStart() - firstVC.getEnd(); - if ( firstVC.isSNP() && secondVC.isSNP() ) { - return nSamples >= minSamplesToMergeSNPs && distance <= MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE; - } else { - return nSamples >= minSamplesToMergeOtherEvents && distance <= MAX_DISTANCE_BETWEEN_OTHER_EVENTS_TO_MERGE; - } - } - } - - /** - * Get the information about the potential merge of two events starting at thisStart and nextStart - * @param haplotypes our haplotypes - * @param thisStart the starting position of the first event to merge - * @param nextStart the starting position of the next event to merge - * @return never {@code null}. - */ - private LDMergeData getPairOfEventsToMerge(final List haplotypes, final int thisStart, final int nextStart) { - final LDMergeData mergeData = new LDMergeData(); - - for( final Haplotype h : haplotypes ) { - // only make complex substitutions out of consecutive biallelic sites - final VariantContext thisHapVC = h.getEventMap().get(thisStart); - if( thisHapVC != null && !thisHapVC.isSymbolic() ) { // something was found at this location on this haplotype - if( mergeData.firstVC == null ) { - mergeData.firstVC = thisHapVC; - } else if( !thisHapVC.hasSameAllelesAs( mergeData.firstVC) ) { - return mergeData.cantBeMerged(); - } - } - final VariantContext nextHapVC = h.getEventMap().get(nextStart); - if( nextHapVC != null && !nextHapVC.isSymbolic() ) { // something was found at the next location on this haplotype - if( mergeData.secondVC == null ) { - mergeData.secondVC = nextHapVC; - } else if( !nextHapVC.hasSameAllelesAs( mergeData.secondVC) ) { - return mergeData.cantBeMerged(); - } - } - } - - // don't try to merge overlapping events - if ( mergeData.firstVC != null && mergeData.secondVC != null && mergeData.firstVC.getEnd() >= mergeData.secondVC.getStart() ) - return mergeData.cantBeMerged(); - - return mergeData; - } - - // BUGBUG: make this merge function more general - protected VariantContext createMergedVariantContext( final VariantContext thisVC, final VariantContext nextVC, final byte[] ref, final GenomeLoc refLoc ) { - final int thisStart = thisVC.getStart(); - final int nextStart = nextVC.getStart(); - byte[] refBases = new byte[]{}; - byte[] altBases = new byte[]{}; - refBases = ArrayUtils.addAll(refBases, thisVC.getReference().getBases()); - altBases = ArrayUtils.addAll(altBases, thisVC.getAlternateAllele(0).getBases()); - int locus; - for( locus = thisStart + refBases.length; locus < nextStart; locus++ ) { - final byte refByte = ref[locus - refLoc.getStart()]; - refBases = ArrayUtils.add(refBases, refByte); - altBases = ArrayUtils.add(altBases, refByte); - } - refBases = ArrayUtils.addAll(refBases, ArrayUtils.subarray(nextVC.getReference().getBases(), locus > nextStart ? 1 : 0, nextVC.getReference().getBases().length)); // special case of deletion including the padding base of consecutive indel - altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases()); - - int iii = 0; - if( refBases.length == altBases.length ) { // insertion + deletion of same length creates an MNP --> trim common prefix bases off the beginning of the allele - while( iii < refBases.length && refBases[iii] == altBases[iii] ) { iii++; } - if ( iii == refBases.length ) { - // we've become a null allele, such as with CA/C + A/AA -> CA/CA => after trimming there's nothing left - // so return a null variant context so we can eliminate the variants from consideration - return null; - } - } - - - final Allele refAllele = Allele.create( ArrayUtils.subarray(refBases, iii, refBases.length), true ); - final Allele altAllele = Allele.create( ArrayUtils.subarray(altBases, iii, altBases.length), false ); - return new VariantContextBuilder("merged", thisVC.getChr(), thisVC.getStart() + iii, nextVC.getEnd(), Arrays.asList(refAllele, altAllele)).make(); - } - - /** - * Update the event maps in all haplotypes to replace a replacement of update1 and 2 with replacement - * - * @param haplotypes the haplotypes whose event maps we need to update - * @param startPosKeySet a sorted set of start positions that we must update - * @param replacement a VariantContext to replace update1 and update2 with. Can be null, indicating that we just want to remove update1 and update2 - * @param update1 the first VC we want to update - * @param update2 the second VC we want to update - */ - private void replaceVariantContextsInMap(final List haplotypes, - final TreeSet startPosKeySet, - final VariantContext replacement, - final VariantContext update1, final VariantContext update2) { - // remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event - for( final Haplotype h : haplotypes ) { - // if we had both events, add replacement. In some cases the haplotype may not have both - // events but they were still merged because the haplotype isn't a particularly informative - // haplotype in any case. The order of operations here is important because we are modifying the map - final boolean shouldAdd = h.getEventMap().containsKey(update1.getStart()) && h.getEventMap().containsKey(update2.getStart()); - h.getEventMap().remove(update1.getStart()); - h.getEventMap().remove(update2.getStart()); - if ( shouldAdd && replacement != null ) { - h.getEventMap().addVC(replacement, false); // cannot merge we other events at the same position - } - } - - startPosKeySet.remove(update1.getStart()); - startPosKeySet.remove(update2.getStart()); - if ( replacement != null ) startPosKeySet.add(replacement.getStart()); - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/MergeVariantsAcrossHaplotypes.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/MergeVariantsAcrossHaplotypes.java deleted file mode 100644 index 403e02988..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/MergeVariantsAcrossHaplotypes.java +++ /dev/null @@ -1,83 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; - -import java.util.List; -import java.util.TreeSet; - -/** - * Baseclass for code that wants to merge variants together in the haplotype caller - * - * This root class is basically a no-op, and can be used to not do any merging - */ -public class MergeVariantsAcrossHaplotypes { - /** - * Merge variants across the haplotypes, updating the haplotype event maps and startPos set as appropriate - * - * @param haplotypes a list of haplotypes whose events we want to merge - * @param readLikelihoods map from sample name -> read likelihoods for each haplotype - * @param startPosKeySet a set of starting positions of all events among the haplotypes - * @param ref the reference bases - * @param refLoc the span of the reference bases - * @return true if anything was merged - */ - public boolean merge( final List haplotypes, - final ReadLikelihoods readLikelihoods, - final TreeSet startPosKeySet, - final byte[] ref, - final GenomeLoc refLoc ) { - return false; - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java index b3c096522..4edfd4f0f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java @@ -53,7 +53,7 @@ package org.broadinstitute.gatk.utils.haplotypeBAMWriter; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMTag; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/ReadDestination.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/ReadDestination.java index ec876bd03..567a3635e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/ReadDestination.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/ReadDestination.java @@ -54,7 +54,7 @@ package org.broadinstitute.gatk.utils.haplotypeBAMWriter; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileWriter; import htsjdk.samtools.SAMReadGroupRecord; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import java.util.ArrayList; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/DebugJNILoglessPairHMM.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/DebugJNILoglessPairHMM.java index c928f2891..eaaf1798f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/DebugJNILoglessPairHMM.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/DebugJNILoglessPairHMM.java @@ -92,14 +92,14 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM { protected HashMap filenameToWriter = new HashMap(); private JNILoglessPairHMM jniPairHMM = null; - public DebugJNILoglessPairHMM(final PairHMM.HMM_IMPLEMENTATION hmmType) { + public DebugJNILoglessPairHMM(final PairHMM.HMM_IMPLEMENTATION hmmType, PairHMM.HMM_SUB_IMPLEMENTATION pairHMMSub, final boolean alwaysLoadVectorLoglessPairHMMLib) { super(); switch(hmmType) { case VECTOR_LOGLESS_CACHING: - jniPairHMM = new VectorLoglessPairHMM(); + jniPairHMM = new VectorLoglessPairHMM(pairHMMSub, alwaysLoadVectorLoglessPairHMMLib); break; default: - throw new UserException.BadArgumentValue("pairHMM","Specified JNIPairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are VECTOR_LOGLESS_CACHING"); + throw new UserException.BadArgumentValue("pairHMM","Specified JNIPairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are VECTOR_LOGLESS_CACHING"); } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/FastLoglessPairHMM.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/FastLoglessPairHMM.java index cf5168f44..28add0f64 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/FastLoglessPairHMM.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/FastLoglessPairHMM.java @@ -51,7 +51,6 @@ package org.broadinstitute.gatk.utils.pairhmm; -import org.broadinstitute.gatk.tools.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine; import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; @@ -398,7 +397,7 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM { for (int kkk = 0; kkk < readQuals.length; kkk++) { readQuals[kkk] = (byte) Math.min(0xff & readQuals[kkk], mq); // cap base quality by mapping - readQuals[kkk] = (byte) (readQuals[kkk] < PairHMMLikelihoodCalculationEngine.BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE + readQuals[kkk] = (byte) (readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readQuals[kkk])); readInsQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readInsQuals[kkk]); readDelQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readDelQuals[kkk]); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/VectorLoglessPairHMM.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/VectorLoglessPairHMM.java index 583f8d5d5..63cce9881 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/VectorLoglessPairHMM.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/VectorLoglessPairHMM.java @@ -51,6 +51,8 @@ package org.broadinstitute.gatk.utils.pairhmm; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; @@ -69,11 +71,7 @@ import java.util.Map; */ public class VectorLoglessPairHMM extends JNILoglessPairHMM { - //For machine capabilities - public static final long sse41Mask = 1; - public static final long sse42Mask = 2; - public static final long avxMask = 4; - public static final long enableAll = 0xFFFFFFFFFFFFFFFFl; + protected final static Logger logger = Logger.getLogger(VectorLoglessPairHMM.class); //Used to copy references to byteArrays to JNI from reads protected class JNIReadDataHolderClass { @@ -97,25 +95,33 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM { * Bit 2 represents AVX availability */ public native long jniGetMachineType(); - + /** * Function to initialize the fields of JNIReadDataHolderClass and JNIHaplotypeDataHolderClass from JVM. * C++ codegets FieldIDs for these classes once and re-uses these IDs for the remainder of the program. Field IDs do not * change per JVM session - * @param readDataHolderClass class type of JNIReadDataHolderClass + * + * @param readDataHolderClass class type of JNIReadDataHolderClass * @param haplotypeDataHolderClass class type of JNIHaplotypeDataHolderClass - * @param mask mask is a 64 bit integer identical to the one received from jniGetMachineType(). Users can disable usage of some hardware features by zeroing some bits in the mask - * */ + * @param mask a 64 bit integer identical to the one received from jniGetMachineType(). Users can disable usage of some hardware features by zeroing bits in the mask + */ private native void jniInitializeClassFieldsAndMachineMask(Class readDataHolderClass, Class haplotypeDataHolderClass, long mask); private static Boolean isVectorLoglessPairHMMLibraryLoaded = false; + //The constructor is called only once inside PairHMMLikelihoodCalculationEngine - public VectorLoglessPairHMM() { + public VectorLoglessPairHMM(final PairHMM.HMM_SUB_IMPLEMENTATION pairHMMSub, final boolean alwaysLoadVectorLoglessPairHMMLib) throws UserException.HardwareFeatureException { super(); - synchronized(isVectorLoglessPairHMMLibraryLoaded) { - //Load the library and initialize the FieldIDs - if(!isVectorLoglessPairHMMLibraryLoaded) { + synchronized (isVectorLoglessPairHMMLibraryLoaded) { + // Get the mask for the requested hardware sub-implementation + // If a specifically requested hardware feature can not be supported, throw an exception + long mask = pairHMMSub.getMask(); + throwIfHardwareFeatureNotSupported(mask, pairHMMSub); + + // Load the library and initialize the FieldIDs + // Load if not loaded or if the the always load flag is true + if (!isVectorLoglessPairHMMLibraryLoaded || alwaysLoadVectorLoglessPairHMMLib) { try { //Try loading from Java's library path first @@ -123,51 +129,53 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM { //implementation without modifying the Java code System.loadLibrary("VectorLoglessPairHMM"); logger.info("libVectorLoglessPairHMM found in JVM library path"); - } - catch(UnsatisfiedLinkError ule) - { + } catch (UnsatisfiedLinkError ule) { //Could not load from Java's library path - try unpacking from jar try { logger.debug("libVectorLoglessPairHMM not found in JVM library path - trying to unpack from GATK jar file"); loadLibraryFromJar("/org/broadinstitute/gatk/utils/pairhmm/libVectorLoglessPairHMM.so"); logger.info("libVectorLoglessPairHMM unpacked successfully from GATK jar file"); - } - catch(IOException ioe) - { + } catch (IOException ioe) { //Throw the UnsatisfiedLinkError to make it clear to the user what failed throw ule; } } logger.info("Using vectorized implementation of PairHMM"); isVectorLoglessPairHMMLibraryLoaded = true; - jniInitializeClassFieldsAndMachineMask(JNIReadDataHolderClass.class, JNIHaplotypeDataHolderClass.class, enableAll); //need to do this only once + + //need to do this only once + jniInitializeClassFieldsAndMachineMask(JNIReadDataHolderClass.class, JNIHaplotypeDataHolderClass.class, mask); } } } - private native void jniInitializeHaplotypes(final int numHaplotypes, JNIHaplotypeDataHolderClass[] haplotypeDataArray); + private native void jniInitializeHaplotypes(final int numHaplotypes, JNIHaplotypeDataHolderClass[] haplotypeDataArray); + //Hold the mapping between haplotype and index in the list of Haplotypes passed to initialize //Use this mapping in computeLikelihoods to find the likelihood value corresponding to a given Haplotype - private HashMap haplotypeToHaplotypeListIdxMap = new HashMap<>(); + private HashMap haplotypeToHaplotypeListIdxMap = new HashMap<>(); private JNIHaplotypeDataHolderClass[] mHaplotypeDataArray; + @Override - public HashMap getHaplotypeToHaplotypeListIdxMap() { return haplotypeToHaplotypeListIdxMap; } - + public HashMap getHaplotypeToHaplotypeListIdxMap() { + return haplotypeToHaplotypeListIdxMap; + } + //Used to transfer data to JNI //Since the haplotypes are the same for all calls to computeLikelihoods within a region, transfer the haplotypes only once to the JNI per region + /** * {@inheritDoc} */ @Override - public void initialize( final List haplotypes, final Map> perSampleReadList, - final int readMaxLength, final int haplotypeMaxLength ) { + public void initialize(final List haplotypes, final Map> perSampleReadList, + final int readMaxLength, final int haplotypeMaxLength) { int numHaplotypes = haplotypes.size(); mHaplotypeDataArray = new JNIHaplotypeDataHolderClass[numHaplotypes]; int idx = 0; haplotypeToHaplotypeListIdxMap.clear(); - for(final Haplotype currHaplotype : haplotypes) - { + for (final Haplotype currHaplotype : haplotypes) { mHaplotypeDataArray[idx] = new JNIHaplotypeDataHolderClass(); mHaplotypeDataArray[idx].haplotypeBases = currHaplotype.getBases(); haplotypeToHaplotypeListIdxMap.put(currHaplotype, idx); @@ -175,6 +183,7 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM { } jniInitializeHaplotypes(numHaplotypes, mHaplotypeDataArray); } + /** * Tell JNI to release arrays - really important if native code is directly accessing Java memory, if not * accessing Java memory directly, still important to release memory from C++ @@ -194,22 +203,22 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM { * Real compute kernel */ private native void jniComputeLikelihoods(int numReads, int numHaplotypes, JNIReadDataHolderClass[] readDataArray, - JNIHaplotypeDataHolderClass[] haplotypeDataArray, double[] likelihoodArray, int maxNumThreadsToUse); + JNIHaplotypeDataHolderClass[] haplotypeDataArray, double[] likelihoodArray, int maxNumThreadsToUse); + /** * {@inheritDoc} */ @Override - public void computeLikelihoods( final ReadLikelihoods.Matrix likelihoods, final List processedReads, final Map gcp ) { + public void computeLikelihoods(final ReadLikelihoods.Matrix likelihoods, final List processedReads, final Map gcp) { if (processedReads.isEmpty()) return; - if(doProfiling) + if (doProfiling) startTime = System.nanoTime(); int readListSize = processedReads.size(); int numHaplotypes = likelihoods.alleleCount(); JNIReadDataHolderClass[] readDataArray = new JNIReadDataHolderClass[readListSize]; int idx = 0; - for(GATKSAMRecord read : processedReads) - { + for (GATKSAMRecord read : processedReads) { readDataArray[idx] = new JNIReadDataHolderClass(); readDataArray[idx].readBases = read.getReadBases(); readDataArray[idx].readQuals = read.getBaseQualities(); @@ -219,8 +228,8 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM { ++idx; } - mLikelihoodArray = new double[readListSize*numHaplotypes]; //to store results - if(doProfiling) + mLikelihoodArray = new double[readListSize * numHaplotypes]; //to store results + if (doProfiling) threadLocalSetupTimeDiff = (System.nanoTime() - startTime); //for(reads) // for(haplotypes) @@ -228,21 +237,19 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM { jniComputeLikelihoods(readListSize, numHaplotypes, readDataArray, mHaplotypeDataArray, mLikelihoodArray, 12); int readIdx = 0; - for(int r = 0; r < readListSize; r++) - { + for (int r = 0; r < readListSize; r++) { int hapIdx = 0; for (final Haplotype haplotype : likelihoods.alleles()) { //Since the order of haplotypes in the List and alleleHaplotypeMap is different, //get idx of current haplotype in the list and use this idx to get the right likelihoodValue final int idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(haplotype); - likelihoods.set(hapIdx,r,mLikelihoodArray[readIdx + idxInsideHaplotypeList]); + likelihoods.set(hapIdx, r, mLikelihoodArray[readIdx + idxInsideHaplotypeList]); ++hapIdx; } readIdx += numHaplotypes; } - if(doProfiling) - { + if (doProfiling) { threadLocalPairHMMComputeTimeDiff = (System.nanoTime() - startTime); //synchronized(doProfiling) { @@ -251,29 +258,30 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM { } } } - + /** * Print final profiling information from native code */ public native void jniClose(); + @Override - public void close() - { - if(doProfiling) - System.out.println("Time spent in setup for JNI call : "+(pairHMMSetupTime*1e-9)); + public void close() { + if (doProfiling) + logger.info("Time spent in setup for JNI call : " + (pairHMMSetupTime * 1e-9)); super.close(); jniClose(); } - + //Copied from http://frommyplayground.com/how-to-load-native-jni-library-from-jar + /** * Loads library from current JAR archive - * + *

    * The file from JAR is copied into system temporary directory and then loaded. The temporary file is deleted after exiting. * Method uses String as filename because the pathname is "abstract", not system-dependent. - * + * * @param path The filename inside JAR as absolute path (beginning with '/'), e.g. /package/File.ext - * @throws IOException If temporary file creation or read/write operation fails + * @throws IOException If temporary file creation or read/write operation fails * @throws IllegalArgumentException If source file (param path) does not exist * @throws IllegalArgumentException If the path is not absolute or if the filename is shorter than three characters (restriction of {@see File#createTempFile(java.lang.String, java.lang.String)}). */ @@ -293,7 +301,7 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM { if (filename != null) { parts = filename.split("\\.", 2); prefix = parts[0]; - suffix = (parts.length > 1) ? "."+parts[parts.length - 1] : null; // Thanks, davs! :-) + suffix = (parts.length > 1) ? "." + parts[parts.length - 1] : null; // Thanks, davs! :-) } // Check if the filename is okay @@ -335,4 +343,36 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM { // Finally, load the library System.load(temp.getAbsolutePath()); } + + /** + * If the machine does not support the requested hardware feature, throw an exception + *

    + * If requesting a specific hardware feature, check if the machine supports this feature. + * If it does not, throw an exception. + * + * @param mask a 64 bit integer identical to the one received from jniGetMachineType(). Users can disable usage of some hardware features by zeroing some bits in the mask. + * @param pairHMMSub the PairHMM machine dependent sub-implementation to use for genotype likelihood calculations + * @throws UserException.HardwareFeatureException if the hardware feature is not supported + */ + private void throwIfHardwareFeatureNotSupported(long mask, PairHMM.HMM_SUB_IMPLEMENTATION pairHMMSub) throws UserException.HardwareFeatureException + { + if ( pairHMMSub.getIsSpecificHardwareRequest() ) { + if ( !isHardwareFeatureSupported(mask) ) + throw new UserException.HardwareFeatureException("Machine does not support pairHMM hardware dependent sub-type = " + pairHMMSub); + } + } + + /** + * Check if the machine supports the requested hardware feature + *

    + * Mask the bits for the hardware feature and check if they are set by the machine + * If the bits are set, the machine supports this feature + * + * @param mask a 64 bit integer identical to the one received from jniGetMachineType(). Users can disable usage of some hardware features by zeroing some bits in the mask. + * @return true of machine supports the requested hardware feature, false otherwise + */ + private boolean isHardwareFeatureSupported(long mask) + { + return (mask & jniGetMachineType()) != 0x0; + } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/BQSRReadTransformer.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/BQSRReadTransformer.java deleted file mode 100644 index 5b342c8be..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/BQSRReadTransformer.java +++ /dev/null @@ -1,104 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.WalkerManager; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -/** - * A ReadTransformer that applies BQSR on the fly to reads - * - * User: rpoplin - * Date: 2/13/12 - */ -public class BQSRReadTransformer extends ReadTransformer { - private boolean enabled; - private BaseRecalibration bqsr = null; - - @Override - public OrderingConstraint getOrderingConstraint() { return OrderingConstraint.MUST_BE_FIRST; } - - @Override - public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { - this.enabled = engine.hasBQSRArgumentSet(); - if ( enabled ) { - // TODO -- See important note below about applying BQSR to a reduced BAM file: - // If it is important to make sure that BQSR is not applied (as opposed to having the covariates computed) against a reduced bam file, - // we need to figure out how to make this work. The problem is that the ReadTransformers are initialized before the ReadDataSource - // inside the GenomeAnalysisEngine, so we generate a NPE when trying to retrieve the SAMFileHeaders. Ultimately, I don't think this is - // a necessary check anyways since we disallow running BaseRecalibrator on reduced bams (so we can't generate the recal tables to use here). - // Although we could add this check to the apply() method below, it's kind of ugly and inefficient. - // The call here would be: RecalUtils.checkForInvalidRecalBams(engine.getSAMFileHeaders(), engine.getArguments().ALLOW_BQSR_ON_REDUCED_BAMS); - final BQSRArgumentSet args = engine.getBQSRArgumentSet(); - this.bqsr = new BaseRecalibration(args.getRecalFile(), args.getQuantizationLevels(), args.shouldDisableIndelQuals(), args.getPreserveQscoresLessThan(), args.shouldEmitOriginalQuals(), args.getGlobalQScorePrior()); - } - final BQSRMode mode = WalkerManager.getWalkerAnnotation(walker, BQSRMode.class); - return mode.ApplicationTime(); - } - - @Override - public boolean enabled() { - return enabled; - } - - /** - * initialize a new BQSRReadTransformer that applies BQSR on the fly to incoming reads. - */ - @Override - public GATKSAMRecord apply(GATKSAMRecord read) { - bqsr.recalibrateRead(read); - return read; - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/BaseRecalibration.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/BaseRecalibration.java deleted file mode 100644 index be64dc4de..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/BaseRecalibration.java +++ /dev/null @@ -1,207 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import com.google.java.contract.Ensures; -import htsjdk.samtools.SAMTag; -import htsjdk.samtools.SAMUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.recalibration.covariates.Covariate; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.io.File; -import java.util.ArrayList; -import java.util.List; - -/** - * Utility methods to facilitate on-the-fly base quality score recalibration. - * - * User: carneiro and rpoplin - * Date: 2/4/12 - */ - -public class BaseRecalibration { - private static Logger logger = Logger.getLogger(BaseRecalibration.class); - private final static boolean TEST_CACHING = false; - - private final QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done) - private final RecalibrationTables recalibrationTables; - private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation - - private final boolean disableIndelQuals; - private final int preserveQLessThan; - private final double globalQScorePrior; - private final boolean emitOriginalQuals; - - /** - * Constructor using a GATK Report file - * - * @param RECAL_FILE a GATK Report file containing the recalibration information - * @param quantizationLevels number of bins to quantize the quality scores - * @param disableIndelQuals if true, do not emit base indel qualities - * @param preserveQLessThan preserve quality scores less than this value - */ - public BaseRecalibration(final File RECAL_FILE, final int quantizationLevels, final boolean disableIndelQuals, final int preserveQLessThan, final boolean emitOriginalQuals, final double globalQScorePrior) { - RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE); - - recalibrationTables = recalibrationReport.getRecalibrationTables(); - requestedCovariates = recalibrationReport.getRequestedCovariates(); - quantizationInfo = recalibrationReport.getQuantizationInfo(); - if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores - quantizationInfo.noQuantization(); - else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wants to use what's in the report. - quantizationInfo.quantizeQualityScores(quantizationLevels); - - this.disableIndelQuals = disableIndelQuals; - this.preserveQLessThan = preserveQLessThan; - this.globalQScorePrior = globalQScorePrior; - this.emitOriginalQuals = emitOriginalQuals; - } - - /** - * Recalibrates the base qualities of a read - * - * It updates the base qualities of the read with the new recalibrated qualities (for all event types) - * - * Implements a serial recalibration of the reads using the combinational table. - * First, we perform a positional recalibration, and then a subsequent dinuc correction. - * - * Given the full recalibration table, we perform the following preprocessing steps: - * - * - calculate the global quality score shift across all data [DeltaQ] - * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift - * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual - * - The final shift equation is: - * - * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) - * - * @param read the read to recalibrate - */ - public void recalibrateRead(final GATKSAMRecord read) { - if (emitOriginalQuals && read.getAttribute(SAMTag.OQ.name()) == null) { // Save the old qualities if the tag isn't already taken in the read - try { - read.setAttribute(SAMTag.OQ.name(), SAMUtils.phredToFastq(read.getBaseQualities())); - } catch (IllegalArgumentException e) { - throw new UserException.MalformedBAM(read, "illegal base quality encountered; " + e.getMessage()); - } - } - - final ReadCovariates readCovariates = RecalUtils.computeCovariates(read, requestedCovariates); - final int readLength = read.getReadLength(); - - for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings - if (disableIndelQuals && errorModel != EventType.BASE_SUBSTITUTION) { - read.setBaseQualities(null, errorModel); - continue; - } - - final byte[] quals = read.getBaseQualities(errorModel); - - // get the keyset for this base using the error model - final int[][] fullReadKeySet = readCovariates.getKeySet(errorModel); - - // the rg key is constant over the whole read, the global deltaQ is too - final int rgKey = fullReadKeySet[0][0]; - final RecalDatum empiricalQualRG = recalibrationTables.getReadGroupTable().get(rgKey, errorModel.ordinal()); - - if( empiricalQualRG != null ) { - final double epsilon = ( globalQScorePrior > 0.0 && errorModel.equals(EventType.BASE_SUBSTITUTION) ? globalQScorePrior : empiricalQualRG.getEstimatedQReported() ); - - for (int offset = 0; offset < readLength; offset++) { // recalibrate all bases in the read - final byte origQual = quals[offset]; - - // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) - if ( origQual >= preserveQLessThan ) { - // get the keyset for this base using the error model - final int[] keySet = fullReadKeySet[offset]; - final RecalDatum empiricalQualQS = recalibrationTables.getQualityScoreTable().get(keySet[0], keySet[1], errorModel.ordinal()); - final List empiricalQualCovs = new ArrayList(); - for (int i = 2; i < requestedCovariates.length; i++) { - if (keySet[i] < 0) { - continue; - } - empiricalQualCovs.add(recalibrationTables.getTable(i).get(keySet[0], keySet[1], keySet[i], errorModel.ordinal())); - } - - double recalibratedQualDouble = hierarchicalBayesianQualityEstimate( epsilon, empiricalQualRG, empiricalQualQS, empiricalQualCovs ); - - // recalibrated quality is bound between 1 and MAX_QUAL - final byte recalibratedQual = QualityUtils.boundQual(MathUtils.fastRound(recalibratedQualDouble), RecalDatum.MAX_RECALIBRATED_Q_SCORE); - - // return the quantized version of the recalibrated quality - final byte recalibratedQualityScore = quantizationInfo.getQuantizedQuals().get(recalibratedQual); - - quals[offset] = recalibratedQualityScore; - } - } - } - - // finally update the base qualities in the read - read.setBaseQualities(quals, errorModel); - } - } - - @Ensures("result > 0.0") - protected static double hierarchicalBayesianQualityEstimate( final double epsilon, final RecalDatum empiricalQualRG, final RecalDatum empiricalQualQS, final List empiricalQualCovs ) { - final double globalDeltaQ = ( empiricalQualRG == null ? 0.0 : empiricalQualRG.getEmpiricalQuality(epsilon) - epsilon ); - final double deltaQReported = ( empiricalQualQS == null ? 0.0 : empiricalQualQS.getEmpiricalQuality(globalDeltaQ + epsilon) - (globalDeltaQ + epsilon) ); - double deltaQCovariates = 0.0; - for( final RecalDatum empiricalQualCov : empiricalQualCovs ) { - deltaQCovariates += ( empiricalQualCov == null ? 0.0 : empiricalQualCov.getEmpiricalQuality(deltaQReported + globalDeltaQ + epsilon) - (deltaQReported + globalDeltaQ + epsilon) ); - } - - return epsilon + globalDeltaQ + deltaQReported + deltaQCovariates; - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/QualQuantizer.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/QualQuantizer.java deleted file mode 100644 index c33089449..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/QualQuantizer.java +++ /dev/null @@ -1,500 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportTable; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.io.PrintStream; -import java.util.*; - -/** - * A general algorithm for quantizing quality score distributions to use a specific number of levels - * - * Takes a histogram of quality scores and a desired number of levels and produces a - * map from original quality scores -> quantized quality scores. - * - * Note that this data structure is fairly heavy-weight, holding lots of debugging and - * calculation information. If you want to use it efficiently at scale with lots of - * read groups the right way to do this: - * - * Map> map - * for each read group rg: - * hist = getQualHist(rg) - * QualQuantizer qq = new QualQuantizer(hist, nLevels, minInterestingQual) - * map.set(rg, qq.getOriginalToQuantizedMap()) - * - * This map would then be used to look up the appropriate original -> quantized - * quals for each read as it comes in. - * - * @author Mark Depristo - * @since 3/2/12 - */ -public class QualQuantizer { - final private static Set MY_EMPTY_SET = Collections.emptySet(); - - private static Logger logger = Logger.getLogger(QualQuantizer.class); - - /** - * Inputs to the QualQuantizer - */ - final int nLevels, minInterestingQual; - final List nObservationsPerQual; - - /** - * Map from original qual (e.g., Q30) to new quantized qual (e.g., Q28). - * - * Has the same range as nObservationsPerQual - */ - final List originalToQuantizedMap; - - /** Sorted set of qual intervals. - * - * After quantize() this data structure contains only the top-level qual intervals - */ - final TreeSet quantizedIntervals; - - /** - * Protected creator for testng use only - */ - protected QualQuantizer(final int minInterestingQual) { - this.nObservationsPerQual = Collections.emptyList(); - this.nLevels = 0; - this.minInterestingQual = minInterestingQual; - this.quantizedIntervals = null; - this.originalToQuantizedMap = null; - } - - /** - * Creates a QualQuantizer for the histogram that has nLevels - * - * Note this is the only interface to the system. After creating this object - * the map can be obtained via getOriginalToQuantizedMap() - * - * @param nObservationsPerQual A histogram of counts of bases with quality scores. Note that - * this histogram must start at 0 (i.e., get(0) => count of Q0 bases) and must include counts all the - * way up to the largest quality score possible in the reads. OK if the histogram includes many 0 - * count bins, as these are quantized for free. - * @param nLevels the desired number of distinct quality scores to represent the full original range. Must - * be at least 1. - * @param minInterestingQual All quality scores <= this value are considered uninteresting and are freely - * merged together. For example, if this value is 10, then Q0-Q10 are all considered free to merge, and - * quantized into a single value. For ILMN data with lots of Q2 bases this results in a Q2 bin containing - * all data with Q0-Q10. - */ - public QualQuantizer(final List nObservationsPerQual, final int nLevels, final int minInterestingQual) { - this.nObservationsPerQual = nObservationsPerQual; - this.nLevels = nLevels; - this.minInterestingQual = minInterestingQual; - - // some sanity checking - if ( Collections.min(nObservationsPerQual) < 0 ) throw new ReviewedGATKException("Quality score histogram has negative values at: " + Utils.join(", ", nObservationsPerQual)); - if ( nLevels < 0 ) throw new ReviewedGATKException("nLevels must be >= 0"); - if ( minInterestingQual < 0 ) throw new ReviewedGATKException("minInterestingQual must be >= 0"); - - // actually run the quantizer - this.quantizedIntervals = quantize(); - - // store the map - this.originalToQuantizedMap = intervalsToMap(quantizedIntervals); - } - - /** - * Represents an contiguous interval of quality scores. - * - * qStart and qEnd are inclusive, so qStart = qEnd = 2 is the quality score bin of 2 - */ - @Invariant({ - "qStart <= qEnd", - "qStart >= 0", - "qEnd <= 1000", - "nObservations >= 0", - "nErrors >= 0", - "nErrors <= nObservations", - "fixedQual >= -1 && fixedQual <= QualityUtils.MAX_SAM_QUAL_SCORE", - "mergeOrder >= 0"}) - protected final class QualInterval implements Comparable { - final int qStart, qEnd, fixedQual, level; - final long nObservations, nErrors; - final Set subIntervals; - - /** for debugging / visualization. When was this interval created? */ - int mergeOrder; - - protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level) { - this(qStart, qEnd, nObservations, nErrors, level, -1, MY_EMPTY_SET); - } - - protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final Set subIntervals) { - this(qStart, qEnd, nObservations, nErrors, level, -1, subIntervals); - } - - protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual) { - this(qStart, qEnd, nObservations, nErrors, level, fixedQual, MY_EMPTY_SET); - } - - @Requires("level >= 0") - public QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual, final Set subIntervals) { - this.qStart = qStart; - this.qEnd = qEnd; - this.nObservations = nObservations; - this.nErrors = nErrors; - this.fixedQual = fixedQual; - this.level = level; - this.mergeOrder = 0; - this.subIntervals = Collections.unmodifiableSet(subIntervals); - } - - /** - * @return Human readable name of this interval: e.g., 10-12 - */ - public String getName() { - return qStart + "-" + qEnd; - } - - @Override - public String toString() { - return "QQ:" + getName(); - } - - /** - * @return the error rate (in real space) of this interval, or 0 if there are no observations - */ - @Ensures("result >= 0.0") - public double getErrorRate() { - if ( hasFixedQual() ) - return QualityUtils.qualToErrorProb((byte)fixedQual); - else if ( nObservations == 0 ) - return 0.0; - else - return (nErrors+1) / (1.0 * (nObservations+1)); - } - - /** - * @return the QUAL of the error rate of this interval, or the fixed qual if this interval was created with a fixed qual. - */ - @Ensures("result >= 0 && result <= QualityUtils.MAX_SAM_QUAL_SCORE") - public byte getQual() { - if ( ! hasFixedQual() ) - return QualityUtils.errorProbToQual(getErrorRate()); - else - return (byte)fixedQual; - } - - /** - * @return true if this bin is using a fixed qual - */ - public boolean hasFixedQual() { - return fixedQual != -1; - } - - @Override - public int compareTo(final QualInterval qualInterval) { - return Integer.valueOf(this.qStart).compareTo(qualInterval.qStart); - } - - /** - * Create a interval representing the merge of this interval and toMerge - * - * Errors and observations are combined - * Subintervals updated in order of left to right (determined by qStart) - * Level is 1 + highest level of this and toMerge - * Order must be updated elsewhere - * - * @param toMerge - * @return newly created merged QualInterval - */ - @Requires({"toMerge != null"}) - @Ensures({ - "result != null", - "result.nObservations >= this.nObservations", - "result.nObservations >= toMerge.nObservations", - "result.nErrors >= this.nErrors", - "result.nErrors >= toMerge.nErrors", - "result.qStart == Math.min(this.qStart, toMerge.qStart)", - "result.qEnd == Math.max(this.qEnd, toMerge.qEnd)", - "result.level > Math.max(this.level, toMerge.level)", - "result.subIntervals.size() == 2" - }) - public QualInterval merge(final QualInterval toMerge) { - final QualInterval left = this.compareTo(toMerge) < 0 ? this : toMerge; - final QualInterval right = this.compareTo(toMerge) < 0 ? toMerge : this; - - if ( left.qEnd + 1 != right.qStart ) - throw new ReviewedGATKException("Attempting to merge non-contiguous intervals: left = " + left + " right = " + right); - - final long nCombinedObs = left.nObservations + right.nObservations; - final long nCombinedErr = left.nErrors + right.nErrors; - - final int level = Math.max(left.level, right.level) + 1; - final Set subIntervals = new HashSet(Arrays.asList(left, right)); - QualInterval merged = new QualInterval(left.qStart, right.qEnd, nCombinedObs, nCombinedErr, level, subIntervals); - - return merged; - } - - public double getPenalty() { - return calcPenalty(getErrorRate()); - } - - - /** - * Calculate the penalty of this interval, given the overall error rate for the interval - * - * If the globalErrorRate is e, this value is: - * - * sum_i |log10(e_i) - log10(e)| * nObservations_i - * - * each the index i applies to all leaves of the tree accessible from this interval - * (found recursively from subIntervals as necessary) - * - * @param globalErrorRate overall error rate in real space against which we calculate the penalty - * @return the cost of approximating the bins in this interval with the globalErrorRate - */ - @Requires("globalErrorRate >= 0.0") - @Ensures("result >= 0.0") - private double calcPenalty(final double globalErrorRate) { - if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty - return 0.0; - - if ( subIntervals.isEmpty() ) { - // this is leave node - if ( this.qEnd <= minInterestingQual ) - // It's free to merge up quality scores below the smallest interesting one - return 0; - else { - return (Math.abs(Math.log10(getErrorRate()) - Math.log10(globalErrorRate))) * nObservations; - } - } else { - double sum = 0; - for ( final QualInterval interval : subIntervals ) - sum += interval.calcPenalty(globalErrorRate); - return sum; - } - } - } - - /** - * Main method for computing the quantization intervals. - * - * Invoked in the constructor after all input variables are initialized. Walks - * over the inputs and builds the min. penalty forest of intervals with exactly nLevel - * root nodes. Finds this min. penalty forest via greedy search, so is not guarenteed - * to find the optimal combination. - * - * TODO: develop a smarter algorithm - * - * @return the forest of intervals with size == nLevels - */ - @Ensures({"! result.isEmpty()", "result.size() == nLevels"}) - private TreeSet quantize() { - // create intervals for each qual individually - final TreeSet intervals = new TreeSet(); - for ( int qStart = 0; qStart < getNQualsInHistogram(); qStart++ ) { - final long nObs = nObservationsPerQual.get(qStart); - final double errorRate = QualityUtils.qualToErrorProb((byte)qStart); - final double nErrors = nObs * errorRate; - final QualInterval qi = new QualInterval(qStart, qStart, nObs, (int)Math.floor(nErrors), 0, (byte)qStart); - intervals.add(qi); - } - - // greedy algorithm: - // while ( n intervals >= nLevels ): - // find intervals to merge with least penalty - // merge it - while ( intervals.size() > nLevels ) { - mergeLowestPenaltyIntervals(intervals); - } - - return intervals; - } - - /** - * Helper function that finds and merges together the lowest penalty pair of intervals - * @param intervals - */ - @Requires("! intervals.isEmpty()") - private void mergeLowestPenaltyIntervals(final TreeSet intervals) { - // setup the iterators - final Iterator it1 = intervals.iterator(); - final Iterator it1p = intervals.iterator(); - it1p.next(); // skip one - - // walk over the pairs of left and right, keeping track of the pair with the lowest merge penalty - QualInterval minMerge = null; - if ( logger.isDebugEnabled() ) logger.debug("mergeLowestPenaltyIntervals: " + intervals.size()); - int lastMergeOrder = 0; - while ( it1p.hasNext() ) { - final QualInterval left = it1.next(); - final QualInterval right = it1p.next(); - final QualInterval merged = left.merge(right); - lastMergeOrder = Math.max(Math.max(lastMergeOrder, left.mergeOrder), right.mergeOrder); - if ( minMerge == null || (merged.getPenalty() < minMerge.getPenalty() ) ) { - if ( logger.isDebugEnabled() ) logger.debug(" Updating merge " + minMerge); - minMerge = merged; - } - } - - // now actually go ahead and merge the minMerge pair - if ( logger.isDebugEnabled() ) logger.debug(" => final min merge " + minMerge); - intervals.removeAll(minMerge.subIntervals); - intervals.add(minMerge); - minMerge.mergeOrder = lastMergeOrder + 1; - if ( logger.isDebugEnabled() ) logger.debug("updated intervals: " + intervals); - } - - /** - * Given a final forest of intervals constructs a list mapping - * list.get(i) => quantized qual to use for original quality score i - * - * This function should be called only once to initialize the corresponding - * cached value in this object, as the calculation is a bit costly. - * - * @param intervals - * @return - */ - @Ensures("result.size() == getNQualsInHistogram()") - private List intervalsToMap(final TreeSet intervals) { - final List map = new ArrayList(getNQualsInHistogram()); - map.addAll(Collections.nCopies(getNQualsInHistogram(), Byte.MIN_VALUE)); - for ( final QualInterval interval : intervals ) { - for ( int q = interval.qStart; q <= interval.qEnd; q++ ) { - map.set(q, interval.getQual()); - } - } - - if ( Collections.min(map) == Byte.MIN_VALUE ) - throw new ReviewedGATKException("quantized quality score map contains an un-initialized value"); - - return map; - } - - @Ensures("result > 0") - private final int getNQualsInHistogram() { - return nObservationsPerQual.size(); - } - - /** - * Write out a GATKReport to visualize the QualQuantization process of this data - * @param out - */ - public void writeReport(PrintStream out) { - final GATKReport report = new GATKReport(); - - addQualHistogramToReport(report); - addIntervalsToReport(report); - - report.print(out); - } - - private final void addQualHistogramToReport(final GATKReport report) { - report.addTable("QualHistogram", "Quality score histogram provided to report", 2); - GATKReportTable table = report.getTable("QualHistogram"); - - table.addColumn("qual"); - table.addColumn("count"); - - for ( int q = 0; q < nObservationsPerQual.size(); q++ ) { - table.set(q, "qual", q); - table.set(q, "count", nObservationsPerQual.get(q)); - } - } - - - private final void addIntervalsToReport(final GATKReport report) { - report.addTable("QualQuantizerIntervals", "Table of QualQuantizer quantization intervals", 10); - GATKReportTable table = report.getTable("QualQuantizerIntervals"); - - table.addColumn("name"); - table.addColumn("qStart"); - table.addColumn("qEnd"); - table.addColumn("level"); - table.addColumn("merge.order"); - table.addColumn("nErrors"); - table.addColumn("nObservations"); - table.addColumn("qual"); - table.addColumn("penalty"); - table.addColumn("root.node"); - //table.addColumn("subintervals", "NA"); - - for ( QualInterval interval : quantizedIntervals ) - addIntervalToReport(table, interval, true); - } - - private final void addIntervalToReport(final GATKReportTable table, final QualInterval interval, final boolean atRootP) { - final String name = interval.getName(); - table.set(name, "name", name); - table.set(name, "qStart", interval.qStart); - table.set(name, "qEnd", interval.qEnd); - table.set(name, "level", interval.level); - table.set(name, "merge.order", interval.mergeOrder); - table.set(name, "nErrors", interval.nErrors); - table.set(name, "nObservations", interval.nObservations); - table.set(name, "qual", interval.getQual()); - table.set(name, "penalty", String.format("%.1f", interval.getPenalty())); - table.set(name, "root.node", atRootP); - - for ( final QualInterval sub : interval.subIntervals ) - addIntervalToReport(table, sub, false); - } - - public List getOriginalToQuantizedMap() { - return originalToQuantizedMap; - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/QuantizationInfo.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/QuantizationInfo.java deleted file mode 100644 index 001643b07..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/QuantizationInfo.java +++ /dev/null @@ -1,151 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.engine.report.GATKReportTable; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; - -import java.util.Arrays; -import java.util.List; - -/** - * Class that encapsulates the information necessary for quality score quantization for BQSR - * - * @author carneiro - * @since 3/26/12 - */ -public class QuantizationInfo { - private List quantizedQuals; - private List empiricalQualCounts; - private int quantizationLevels; - - private QuantizationInfo(List quantizedQuals, List empiricalQualCounts, int quantizationLevels) { - this.quantizedQuals = quantizedQuals; - this.empiricalQualCounts = empiricalQualCounts; - this.quantizationLevels = quantizationLevels; - } - - public QuantizationInfo(List quantizedQuals, List empiricalQualCounts) { - this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals)); - } - - public QuantizationInfo(final RecalibrationTables recalibrationTables, final int quantizationLevels) { - final Long [] qualHistogram = new Long[QualityUtils.MAX_SAM_QUAL_SCORE +1]; // create a histogram with the empirical quality distribution - for (int i = 0; i < qualHistogram.length; i++) - qualHistogram[i] = 0L; - - final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); // get the quality score table - - for (final RecalDatum value : qualTable.getAllValues()) { - final RecalDatum datum = value; - final int empiricalQual = MathUtils.fastRound(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL ) - qualHistogram[empiricalQual] += (long) datum.getNumObservations(); // add the number of observations for every key - } - empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities - quantizeQualityScores(quantizationLevels); - - this.quantizationLevels = quantizationLevels; - } - - - public void quantizeQualityScores(int nLevels) { - QualQuantizer quantizer = new QualQuantizer(empiricalQualCounts, nLevels, QualityUtils.MIN_USABLE_Q_SCORE); // quantize the qualities to the desired number of levels - quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC) - } - - public void noQuantization() { - this.quantizationLevels = QualityUtils.MAX_SAM_QUAL_SCORE; - for (int i = 0; i < this.quantizationLevels; i++) - quantizedQuals.set(i, (byte) i); - } - - public List getQuantizedQuals() { - return quantizedQuals; - } - - public int getQuantizationLevels() { - return quantizationLevels; - } - - public GATKReportTable generateReportTable(boolean sortByCols) { - GATKReportTable quantizedTable; - if(sortByCols) { - quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3, GATKReportTable.TableSortingWay.SORT_BY_COLUMN); - } else { - quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3); - } - quantizedTable.addColumn(RecalUtils.QUALITY_SCORE_COLUMN_NAME); - quantizedTable.addColumn(RecalUtils.QUANTIZED_COUNT_COLUMN_NAME); - quantizedTable.addColumn(RecalUtils.QUANTIZED_VALUE_COLUMN_NAME); - - for (int qual = 0; qual <= QualityUtils.MAX_SAM_QUAL_SCORE; qual++) { - quantizedTable.set(qual, RecalUtils.QUALITY_SCORE_COLUMN_NAME, qual); - quantizedTable.set(qual, RecalUtils.QUANTIZED_COUNT_COLUMN_NAME, empiricalQualCounts.get(qual)); - quantizedTable.set(qual, RecalUtils.QUANTIZED_VALUE_COLUMN_NAME, quantizedQuals.get(qual)); - } - return quantizedTable; - } - - private static int calculateQuantizationLevels(List quantizedQuals) { - byte lastByte = -1; - int quantizationLevels = 0; - for (byte q : quantizedQuals) { - if (q != lastByte) { - quantizationLevels++; - lastByte = q; - } - } - return quantizationLevels; - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/ReadCovariates.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/ReadCovariates.java deleted file mode 100644 index e2aed8b48..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/ReadCovariates.java +++ /dev/null @@ -1,175 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.LRUCache; - -/** - * The object temporarily held by a read that describes all of it's covariates. - * - * In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap - * - * @author Mauricio Carneiro - * @since 2/8/12 - */ -public class ReadCovariates { - private final static Logger logger = Logger.getLogger(ReadCovariates.class); - - /** - * How big should we let the LRU cache grow - */ - private static final int LRU_CACHE_SIZE = 500; - - /** - * Use an LRU cache to keep cache of keys (int[][][]) arrays for each read length we've seen. - * The cache allows us to avoid the expense of recreating these arrays for every read. The LRU - * keeps the total number of cached arrays to less than LRU_CACHE_SIZE. - * - * This is a thread local variable, so the total memory required may grow to N_THREADS x LRU_CACHE_SIZE - */ - private final static ThreadLocal> keysCache = new ThreadLocal>() { - @Override protected LRUCache initialValue() { - return new LRUCache(LRU_CACHE_SIZE); - } - }; - - /** - * The keys cache is only valid for a single covariate count. Normally this will remain constant for the analysis. - * If running multiple analyses (or the unit test suite), it's necessary to clear the cache. - */ - public static void clearKeysCache() { - keysCache.remove(); - } - - /** - * Our keys, indexed by event type x read length x covariate - */ - private final int[][][] keys; - - /** - * The index of the current covariate, used by addCovariate - */ - private int currentCovariateIndex = 0; - - public ReadCovariates(final int readLength, final int numberOfCovariates) { - final LRUCache cache = keysCache.get(); - final int[][][] cachedKeys = cache.get(readLength); - if ( cachedKeys == null ) { - // There's no cached value for read length so we need to create a new int[][][] array - if ( logger.isDebugEnabled() ) logger.debug("Keys cache miss for length " + readLength + " cache size " + cache.size()); - keys = new int[EventType.values().length][readLength][numberOfCovariates]; - cache.put(readLength, keys); - } else { - keys = cachedKeys; - } - } - - public void setCovariateIndex(final int index) { - currentCovariateIndex = index; - } - - /** - * Update the keys for mismatch, insertion, and deletion for the current covariate at read offset - * - * NOTE: no checks are performed on the number of covariates, for performance reasons. If the count increases - * after the keysCache has been accessed, this method will throw an ArrayIndexOutOfBoundsException. This currently - * only occurs in the testing harness, and we don't anticipate that it will become a part of normal runs. - * - * @param mismatch the mismatch key value - * @param insertion the insertion key value - * @param deletion the deletion key value - * @param readOffset the read offset, must be >= 0 and <= the read length used to create this ReadCovariates - */ - public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { - keys[EventType.BASE_SUBSTITUTION.ordinal()][readOffset][currentCovariateIndex] = mismatch; - keys[EventType.BASE_INSERTION.ordinal()][readOffset][currentCovariateIndex] = insertion; - keys[EventType.BASE_DELETION.ordinal()][readOffset][currentCovariateIndex] = deletion; - } - - /** - * Get the keys for all covariates at read position for error model - * - * @param readPosition - * @param errorModel - * @return - */ - public int[] getKeySet(final int readPosition, final EventType errorModel) { - return keys[errorModel.ordinal()][readPosition]; - } - - public int[][] getKeySet(final EventType errorModel) { - return keys[errorModel.ordinal()]; - } - - // ---------------------------------------------------------------------- - // - // routines for testing - // - // ---------------------------------------------------------------------- - - protected int[][] getMismatchesKeySet() { return getKeySet(EventType.BASE_SUBSTITUTION); } - protected int[][] getInsertionsKeySet() { return getKeySet(EventType.BASE_INSERTION); } - protected int[][] getDeletionsKeySet() { return getKeySet(EventType.BASE_DELETION); } - - protected int[] getMismatchesKeySet(final int readPosition) { - return getKeySet(readPosition, EventType.BASE_SUBSTITUTION); - } - - protected int[] getInsertionsKeySet(final int readPosition) { - return getKeySet(readPosition, EventType.BASE_INSERTION); - } - - protected int[] getDeletionsKeySet(final int readPosition) { - return getKeySet(readPosition, EventType.BASE_DELETION); - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalDatum.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalDatum.java deleted file mode 100644 index 6cfc435f7..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalDatum.java +++ /dev/null @@ -1,434 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import com.google.java.contract.Requires; -import htsjdk.samtools.SAMUtils; -import org.apache.commons.math.optimization.fitting.GaussianFunction; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.QualityUtils; - - -/** - * An individual piece of recalibration data. Each bin counts up the number of observations and the number - * of reference mismatches seen for that combination of covariates. - * - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 3, 2009 - */ -@Invariant({ - "estimatedQReported >= 0.0", - "! Double.isNaN(estimatedQReported)", - "! Double.isInfinite(estimatedQReported)", - "empiricalQuality >= 0.0 || empiricalQuality == UNINITIALIZED", - "! Double.isNaN(empiricalQuality)", - "! Double.isInfinite(empiricalQuality)", - "numObservations >= 0", - "numMismatches >= 0", - "numMismatches <= numObservations" -}) -public class RecalDatum { - public final static byte MAX_RECALIBRATED_Q_SCORE = SAMUtils.MAX_PHRED_SCORE; - private static final double UNINITIALIZED = -1.0; - - /** - * estimated reported quality score based on combined data's individual q-reporteds and number of observations - */ - private double estimatedQReported; - - /** - * the empirical quality for datums that have been collapsed together (by read group and reported quality, for example) - */ - private double empiricalQuality; - - /** - * number of bases seen in total - */ - private long numObservations; - - /** - * number of bases seen that didn't match the reference - */ - private double numMismatches; - - /** - * used when calculating empirical qualities to avoid division by zero - */ - private static final int SMOOTHING_CONSTANT = 1; - - //--------------------------------------------------------------------------------------------------------------- - // - // constructors - // - //--------------------------------------------------------------------------------------------------------------- - - /** - * Create a new RecalDatum with given observation and mismatch counts, and an reported quality - * - * @param _numObservations observations - * @param _numMismatches mismatches - * @param reportedQuality Qreported - */ - public RecalDatum(final long _numObservations, final double _numMismatches, final byte reportedQuality) { - if ( _numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); - if ( _numMismatches < 0.0 ) throw new IllegalArgumentException("numMismatches < 0"); - if ( reportedQuality < 0 ) throw new IllegalArgumentException("reportedQuality < 0"); - - numObservations = _numObservations; - numMismatches = _numMismatches; - estimatedQReported = reportedQuality; - empiricalQuality = UNINITIALIZED; - } - - /** - * Copy copy into this recal datum, overwriting all of this objects data - * @param copy RecalDatum to copy - */ - public RecalDatum(final RecalDatum copy) { - this.numObservations = copy.getNumObservations(); - this.numMismatches = copy.getNumMismatches(); - this.estimatedQReported = copy.estimatedQReported; - this.empiricalQuality = copy.empiricalQuality; - } - - /** - * Add in all of the data from other into this object, updating the reported quality from the expected - * error rate implied by the two reported qualities - * - * @param other RecalDatum to combine - */ - public synchronized void combine(final RecalDatum other) { - final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); - increment(other.getNumObservations(), other.getNumMismatches()); - estimatedQReported = -10 * Math.log10(sumErrors / getNumObservations()); - empiricalQuality = UNINITIALIZED; - } - - public synchronized void setEstimatedQReported(final double estimatedQReported) { - if ( estimatedQReported < 0 ) throw new IllegalArgumentException("estimatedQReported < 0"); - if ( Double.isInfinite(estimatedQReported) ) throw new IllegalArgumentException("estimatedQReported is infinite"); - if ( Double.isNaN(estimatedQReported) ) throw new IllegalArgumentException("estimatedQReported is NaN"); - - this.estimatedQReported = estimatedQReported; - empiricalQuality = UNINITIALIZED; - } - - public final double getEstimatedQReported() { - return estimatedQReported; - } - public final byte getEstimatedQReportedAsByte() { - return (byte)(int)(Math.round(getEstimatedQReported())); - } - - //--------------------------------------------------------------------------------------------------------------- - // - // Empirical quality score -- derived from the num mismatches and observations - // - //--------------------------------------------------------------------------------------------------------------- - - /** - * Returns the error rate (in real space) of this interval, or 0 if there are no observations - * @return the empirical error rate ~= N errors / N obs - */ - @Ensures({"result >= 0.0"}) - public double getEmpiricalErrorRate() { - if ( numObservations == 0 ) - return 0.0; - else { - // cache the value so we don't call log over and over again - final double doubleMismatches = numMismatches + SMOOTHING_CONSTANT; - // smoothing is one error and one non-error observation, for example - final double doubleObservations = numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT; - return doubleMismatches / doubleObservations; - } - } - - public synchronized void setEmpiricalQuality(final double empiricalQuality) { - if ( empiricalQuality < 0 ) throw new IllegalArgumentException("empiricalQuality < 0"); - if ( Double.isInfinite(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is infinite"); - if ( Double.isNaN(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is NaN"); - - this.empiricalQuality = empiricalQuality; - } - - public final double getEmpiricalQuality() { - return getEmpiricalQuality(getEstimatedQReported()); - } - - public synchronized final double getEmpiricalQuality(final double conditionalPrior) { - if (empiricalQuality == UNINITIALIZED) { - calcEmpiricalQuality(conditionalPrior); - } - return empiricalQuality; - } - - public final byte getEmpiricalQualityAsByte() { - return (byte)(Math.round(getEmpiricalQuality())); - } - - //--------------------------------------------------------------------------------------------------------------- - // - // toString methods - // - //--------------------------------------------------------------------------------------------------------------- - - @Override - public String toString() { - return String.format("%d,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality()); - } - - public String stringForCSV() { - return String.format("%s,%.2f,%.2f", toString(), getEstimatedQReported(), getEmpiricalQuality() - getEstimatedQReported()); - } - - //--------------------------------------------------------------------------------------------------------------- - // - // increment methods - // - //--------------------------------------------------------------------------------------------------------------- - - public final long getNumObservations() { - return numObservations; - } - - public final synchronized void setNumObservations(final long numObservations) { - if ( numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); - this.numObservations = numObservations; - empiricalQuality = UNINITIALIZED; - } - - public final double getNumMismatches() { - return numMismatches; - } - - @Requires({"numMismatches >= 0"}) - public final synchronized void setNumMismatches(final double numMismatches) { - if ( numMismatches < 0 ) throw new IllegalArgumentException("numMismatches < 0"); - this.numMismatches = numMismatches; - empiricalQuality = UNINITIALIZED; - } - - @Requires({"by >= 0"}) - public final synchronized void incrementNumObservations(final long by) { - numObservations += by; - empiricalQuality = UNINITIALIZED; - } - - @Requires({"by >= 0"}) - public final synchronized void incrementNumMismatches(final double by) { - numMismatches += by; - empiricalQuality = UNINITIALIZED; - } - - @Requires({"incObservations >= 0", "incMismatches >= 0"}) - @Ensures({"numObservations == old(numObservations) + incObservations", "numMismatches == old(numMismatches) + incMismatches"}) - public final synchronized void increment(final long incObservations, final double incMismatches) { - numObservations += incObservations; - numMismatches += incMismatches; - empiricalQuality = UNINITIALIZED; - } - - @Ensures({"numObservations == old(numObservations) + 1", "numMismatches >= old(numMismatches)"}) - public final synchronized void increment(final boolean isError) { - increment(1, isError ? 1.0 : 0.0); - } - - // ------------------------------------------------------------------------------------- - // - // Private implementation helper functions - // - // ------------------------------------------------------------------------------------- - - /** - * calculate the expected number of errors given the estimated Q reported and the number of observations - * in this datum. - * - * @return a positive (potentially fractional) estimate of the number of errors - */ - @Ensures("result >= 0.0") - private double calcExpectedErrors() { - return getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported); - } - - /** - * Calculate and cache the empirical quality score from mismatches and observations (expensive operation) - */ - @Requires("empiricalQuality == UNINITIALIZED") - @Ensures("empiricalQuality != UNINITIALIZED") - private synchronized void calcEmpiricalQuality(final double conditionalPrior) { - - // smoothing is one error and one non-error observation - final long mismatches = (long)(getNumMismatches() + 0.5) + SMOOTHING_CONSTANT; - final long observations = getNumObservations() + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT; - - final double empiricalQual = RecalDatum.bayesianEstimateOfEmpiricalQuality(observations, mismatches, conditionalPrior); - - // This is the old and busted point estimate approach: - //final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate()); - - empiricalQuality = Math.min(empiricalQual, (double) MAX_RECALIBRATED_Q_SCORE); - } - - //static final boolean DEBUG = false; - static private final double RESOLUTION_BINS_PER_QUAL = 1.0; - - static public double bayesianEstimateOfEmpiricalQuality(final long nObservations, final long nErrors, final double QReported) { - - final int numBins = (QualityUtils.MAX_REASONABLE_Q_SCORE + 1) * (int)RESOLUTION_BINS_PER_QUAL; - - final double[] log10Posteriors = new double[numBins]; - - for ( int bin = 0; bin < numBins; bin++ ) { - - final double QEmpOfBin = bin / RESOLUTION_BINS_PER_QUAL; - - log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10QempLikelihood(QEmpOfBin, nObservations, nErrors); - - //if ( DEBUG ) - // System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin])); - } - - //if ( DEBUG ) - // System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors)); - - final double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10Posteriors); - final int MLEbin = MathUtils.maxElementIndex(normalizedPosteriors); - - final double Qemp = MLEbin / RESOLUTION_BINS_PER_QUAL; - return Qemp; - } - - /** - * Quals above this value should be capped down to this value (because they are too high) - * in the base quality score recalibrator - */ - public final static byte MAX_GATK_USABLE_Q_SCORE = 40; - static private final double[] log10QempPriorCache = new double[MAX_GATK_USABLE_Q_SCORE + 1]; - static { - // f(x) = a + b*exp(-((x - c)^2 / (2*d^2))) - // Note that b is the height of the curve's peak, c is the position of the center of the peak, and d controls the width of the "bell". - final double GF_a = 0.0; - final double GF_b = 0.9; - final double GF_c = 0.0; - final double GF_d = 0.5; // with these parameters, deltas can shift at most ~20 Q points - - final GaussianFunction gaussian = new GaussianFunction(GF_a, GF_b, GF_c, GF_d); - for ( int i = 0; i <= MAX_GATK_USABLE_Q_SCORE; i++ ) { - double log10Prior = Math.log10(gaussian.value((double) i)); - if ( Double.isInfinite(log10Prior) ) - log10Prior = -Double.MAX_VALUE; - log10QempPriorCache[i] = log10Prior; - } - } - - static protected double log10QempPrior(final double Qempirical, final double Qreported) { - final int difference = Math.min(Math.abs((int) (Qempirical - Qreported)), MAX_GATK_USABLE_Q_SCORE); - //if ( DEBUG ) - // System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference])); - return log10QempPriorCache[difference]; - } - - static private final long MAX_NUMBER_OF_OBSERVATIONS = Integer.MAX_VALUE - 1; - - static protected double log10QempLikelihood(final double Qempirical, long nObservations, long nErrors) { - if ( nObservations == 0 ) - return 0.0; - - // the binomial code requires ints as input (because it does caching). This should theoretically be fine because - // there is plenty of precision in 2^31 observations, but we need to make sure that we don't have overflow - // before casting down to an int. - if ( nObservations > MAX_NUMBER_OF_OBSERVATIONS ) { - // we need to decrease nErrors by the same fraction that we are decreasing nObservations - final double fraction = (double)MAX_NUMBER_OF_OBSERVATIONS / (double)nObservations; - nErrors = Math.round((double)nErrors * fraction); - nObservations = MAX_NUMBER_OF_OBSERVATIONS; - } - - // this is just a straight binomial PDF - double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10(Qempirical)); - if ( Double.isInfinite(log10Prob) || Double.isNaN(log10Prob) ) - log10Prob = -Double.MAX_VALUE; - - //if ( DEBUG ) - // System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob)); - - return log10Prob; - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalDatumNode.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalDatumNode.java deleted file mode 100644 index f3759cdb7..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalDatumNode.java +++ /dev/null @@ -1,582 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.commons.math.MathException; -import org.apache.commons.math.stat.inference.ChiSquareTestImpl; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.Collection; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.Set; - -/** - * A tree of recal datum, where each contains a set of sub datum representing sub-states of the higher level one - * - * @author Mark DePristo - * @since 07/27/12 - */ -public class RecalDatumNode { - private final static double SMALLEST_CHI2_PVALUE = 1e-300; - protected static final Logger logger = Logger.getLogger(RecalDatumNode.class); - - /** - * fixedPenalty is this value if it's considered fixed - */ - private final static double UNINITIALIZED = Double.NEGATIVE_INFINITY; - - private final T recalDatum; - private double fixedPenalty = UNINITIALIZED; - private final Set> subnodes; - - @Requires({"recalDatum != null"}) - public RecalDatumNode(final T recalDatum) { - this(recalDatum, new HashSet>()); - } - - @Override - public String toString() { - return recalDatum.toString(); - } - - @Requires({"recalDatum != null", "subnodes != null"}) - public RecalDatumNode(final T recalDatum, final Set> subnodes) { - this(recalDatum, UNINITIALIZED, subnodes); - } - - @Requires({"recalDatum != null"}) - protected RecalDatumNode(final T recalDatum, final double fixedPenalty) { - this(recalDatum, fixedPenalty, new HashSet>()); - } - - @Requires({"recalDatum != null", "subnodes != null"}) - protected RecalDatumNode(final T recalDatum, final double fixedPenalty, final Set> subnodes) { - this.recalDatum = recalDatum; - this.fixedPenalty = fixedPenalty; - this.subnodes = new HashSet>(subnodes); - } - - /** - * Get the recal data associated with this node - * @return - */ - @Ensures("result != null") - public T getRecalDatum() { - return recalDatum; - } - - /** - * The set of all subnodes of this tree. May be modified. - * @return - */ - @Ensures("result != null") - public Set> getSubnodes() { - return subnodes; - } - - /** - * Return the fixed penalty, if set, or else the the calculated penalty for this node - * @return - */ - public double getPenalty() { - if ( fixedPenalty != UNINITIALIZED ) - return fixedPenalty; - else - return calcPenalty(); - } - - /** - * Set the fixed penalty for this node to a fresh calculation from calcPenalty - * - * This is important in the case where you want to compute the penalty from a full - * tree and then chop the tree up afterwards while considering the previous penalties. - * If you don't call this function then manipulating the tree may result in the - * penalty functions changing with changes in the tree. - * - * @param doEntireTree recurse into all subnodes? - * @return the fixed penalty for this node - */ - public double calcAndSetFixedPenalty(final boolean doEntireTree) { - fixedPenalty = calcPenalty(); - if ( doEntireTree ) - for ( final RecalDatumNode sub : subnodes ) - sub.calcAndSetFixedPenalty(doEntireTree); - return fixedPenalty; - } - - /** - * Add node to the set of subnodes of this node - * @param sub - */ - @Requires("sub != null") - public void addSubnode(final RecalDatumNode sub) { - subnodes.add(sub); - } - - /** - * Is this a leaf node (i.e., has no subnodes)? - * @return - */ - public boolean isLeaf() { - return subnodes.isEmpty(); - } - - /** - * Is this node immediately above only leaf nodes? - * - * @return - */ - public boolean isAboveOnlyLeaves() { - for ( final RecalDatumNode sub : subnodes ) - if ( ! sub.isLeaf() ) - return false; - return true; - } - - /** - * What's the immediate number of subnodes from this node? - * @return - */ - @Ensures("result >= 0") - public int getNumSubnodes() { - return subnodes.size(); - } - - /** - * Total penalty is the sum of leaf node penalties - * - * This algorithm assumes that penalties have been fixed before pruning, as leaf nodes by - * definition have 0 penalty unless they represent a pruned tree with underlying -- but now - * pruned -- subtrees - * - * @return - */ - public double totalPenalty() { - if ( isLeaf() ) - return getPenalty(); - else { - double sum = 0.0; - for ( final RecalDatumNode sub : subnodes ) - sum += sub.totalPenalty(); - return sum; - } - } - - /** - * The maximum penalty among all nodes - * @return - */ - public double maxPenalty(final boolean leafOnly) { - double max = ! leafOnly || isLeaf() ? getPenalty() : Double.MIN_VALUE; - for ( final RecalDatumNode sub : subnodes ) - max = Math.max(max, sub.maxPenalty(leafOnly)); - return max; - } - - /** - * The minimum penalty among all nodes - * @return - */ - public double minPenalty(final boolean leafOnly) { - double min = ! leafOnly || isLeaf() ? getPenalty() : Double.MAX_VALUE; - for ( final RecalDatumNode sub : subnodes ) - min = Math.min(min, sub.minPenalty(leafOnly)); - return min; - } - - /** - * What's the longest branch from this node to any leaf? - * @return - */ - public int maxDepth() { - int subMax = 0; - for ( final RecalDatumNode sub : subnodes ) - subMax = Math.max(subMax, sub.maxDepth()); - return subMax + 1; - } - - /** - * What's the shortest branch from this node to any leaf? Includes this node - * @return - */ - @Ensures("result > 0") - public int minDepth() { - if ( isLeaf() ) - return 1; - else { - int subMin = Integer.MAX_VALUE; - for ( final RecalDatumNode sub : subnodes ) - subMin = Math.min(subMin, sub.minDepth()); - return subMin + 1; - } - } - - /** - * Return the number of nodes, including this one, reachable from this node - * @return - */ - @Ensures("result > 0") - public int size() { - int size = 1; - for ( final RecalDatumNode sub : subnodes ) - size += sub.size(); - return size; - } - - /** - * Count the number of leaf nodes reachable from this node - * - * @return - */ - @Ensures("result >= 0") - public int numLeaves() { - if ( isLeaf() ) - return 1; - else { - int size = 0; - for ( final RecalDatumNode sub : subnodes ) - size += sub.numLeaves(); - return size; - } - } - - /** - * Calculate the phred-scaled p-value for a chi^2 test for independent among subnodes of this node. - * - * The chi^2 value indicates the degree of independence of the implied error rates among the - * immediate subnodes - * - * @return the phred-scaled p-value for chi2 penalty, or 0.0 if it cannot be calculated - */ - private double calcPenalty() { - if ( isLeaf() || freeToMerge() ) - return 0.0; - else if ( subnodes.size() == 1 ) - // only one value, so its free to merge away - return 0.0; - else { - final long[][] counts = new long[subnodes.size()][2]; - - int i = 0; - for ( final RecalDatumNode subnode : subnodes ) { - // use the yates correction to help avoid all zeros => NaN - counts[i][0] = Math.round(subnode.getRecalDatum().getNumMismatches()) + 1L; - counts[i][1] = subnode.getRecalDatum().getNumObservations() + 2L; - i++; - } - - try { - final double chi2PValue = new ChiSquareTestImpl().chiSquareTest(counts); - final double penalty = -10.0 * Math.log10(Math.max(chi2PValue, SMALLEST_CHI2_PVALUE)); - - // make sure things are reasonable and fail early if not - if (Double.isInfinite(penalty) || Double.isNaN(penalty)) - throw new ReviewedGATKException("chi2 value is " + chi2PValue + " at " + getRecalDatum()); - - return penalty; - } catch ( MathException e ) { - throw new ReviewedGATKException("Failed in calculating chi2 value", e); - } - } - } - - /** - * Is this node free to merge because its rounded Q score is the same as all nodes below - * @return - */ - private boolean freeToMerge() { - if ( isLeaf() ) // leaves are free to merge - return true; - else { - final byte myQual = getRecalDatum().getEmpiricalQualityAsByte(); - for ( final RecalDatumNode sub : subnodes ) - if ( sub.getRecalDatum().getEmpiricalQualityAsByte() != myQual ) - return false; - return true; - } - } - - /** - * Calculate the penalty of this interval, given the overall error rate for the interval - * - * If the globalErrorRate is e, this value is: - * - * sum_i |log10(e_i) - log10(e)| * nObservations_i - * - * each the index i applies to all leaves of the tree accessible from this interval - * (found recursively from subnodes as necessary) - * - * @param globalErrorRate overall error rate in real space against which we calculate the penalty - * @return the cost of approximating the bins in this interval with the globalErrorRate - */ - @Requires("globalErrorRate >= 0.0") - @Ensures("result >= 0.0") - private double calcPenaltyLog10(final double globalErrorRate) { - if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty - return 0.0; - - if ( isLeaf() ) { - // this is leave node - return (Math.abs(Math.log10(recalDatum.getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * (double)recalDatum.getNumObservations(); - // TODO -- how we can generalize this calculation? -// if ( this.qEnd <= minInterestingQual ) -// // It's free to merge up quality scores below the smallest interesting one -// return 0; -// else { -// return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations(); -// } - } else { - double sum = 0; - for ( final RecalDatumNode hrd : subnodes) - sum += hrd.calcPenaltyLog10(globalErrorRate); - return sum; - } - } - - /** - * Return a freshly allocated tree prunes to have no more than maxDepth from the root to any leaf - * - * @param maxDepth - * @return - */ - public RecalDatumNode pruneToDepth(final int maxDepth) { - if ( maxDepth < 1 ) - throw new IllegalArgumentException("maxDepth < 1"); - else { - final Set> subPruned = new HashSet>(getNumSubnodes()); - if ( maxDepth > 1 ) - for ( final RecalDatumNode sub : subnodes ) - subPruned.add(sub.pruneToDepth(maxDepth - 1)); - return new RecalDatumNode(getRecalDatum(), fixedPenalty, subPruned); - } - } - - /** - * Return a freshly allocated tree with to no more than maxElements in order of penalty - * - * Note that nodes must have fixed penalties to this algorithm will fail. - * - * @param maxElements - * @return - */ - public RecalDatumNode pruneByPenalty(final int maxElements) { - RecalDatumNode root = this; - - while ( root.size() > maxElements ) { - // remove the lowest penalty element, and continue - root = root.removeLowestPenaltyNode(); - } - - // our size is below the target, so we are good, return - return root; - } - - /** - * Return a freshly allocated tree where all mergable nodes with < maxPenalty are merged - * - * Note that nodes must have fixed penalties to this algorithm will fail. - * - * @param maxPenaltyIn the maximum penalty we are allowed to incur for a merge - * @param applyBonferroniCorrection if true, we will adjust penalty by the phred-scaled bonferroni correction - * for the size of the initial tree. That is, if there are 10 nodes in the - * tree and maxPenalty is 20 we will actually enforce 10^-2 / 10 = 10^-3 = 30 - * penalty for multiple testing - * @return - */ - public RecalDatumNode pruneToNoMoreThanPenalty(final double maxPenaltyIn, final boolean applyBonferroniCorrection) { - RecalDatumNode root = this; - - final double bonferroniCorrection = 10 * Math.log10(this.size()); - final double maxPenalty = applyBonferroniCorrection ? maxPenaltyIn + bonferroniCorrection : maxPenaltyIn; - - if ( applyBonferroniCorrection ) - logger.info(String.format("Applying Bonferroni correction for %d nodes = %.2f to initial penalty %.2f for total " + - "corrected max penalty of %.2f", this.size(), bonferroniCorrection, maxPenaltyIn, maxPenalty)); - - while ( true ) { - final Pair, Double> minPenaltyNode = root.getMinPenaltyAboveLeafNode(); - - if ( minPenaltyNode == null || minPenaltyNode.getSecond() > maxPenalty ) { - // nothing to merge, or the best candidate is above our max allowed - if ( minPenaltyNode == null ) { - if ( logger.isDebugEnabled() ) logger.debug("Stopping because no candidates could be found"); - } else { - if ( logger.isDebugEnabled() ) logger.debug("Stopping because node " + minPenaltyNode.getFirst() + " has penalty " + minPenaltyNode.getSecond() + " > max " + maxPenalty); - } - break; - } else { - // remove the lowest penalty element, and continue - if ( logger.isDebugEnabled() ) logger.debug("Removing node " + minPenaltyNode.getFirst() + " with penalty " + minPenaltyNode.getSecond()); - root = root.removeLowestPenaltyNode(); - } - } - - // no more candidates exist with penalty < maxPenalty - return root; - } - - - /** - * Find the lowest penalty above leaf node in the tree, and return a tree without it - * - * Note this excludes the current (root) node - * - * @return - */ - private RecalDatumNode removeLowestPenaltyNode() { - final Pair, Double> nodeToRemove = getMinPenaltyAboveLeafNode(); - if ( logger.isDebugEnabled() ) - logger.debug("Removing " + nodeToRemove.getFirst() + " with penalty " + nodeToRemove.getSecond()); - - final Pair, Boolean> result = removeNode(nodeToRemove.getFirst()); - - if ( ! result.getSecond() ) - throw new IllegalStateException("Never removed any node!"); - - final RecalDatumNode oneRemoved = result.getFirst(); - if ( oneRemoved == null ) - throw new IllegalStateException("Removed our root node, wow, didn't expect that"); - return oneRemoved; - } - - /** - * Finds in the tree the node with the lowest penalty whose subnodes are all leaves - * - * @return the node and its penalty, or null if no such node exists - */ - private Pair, Double> getMinPenaltyAboveLeafNode() { - if ( isLeaf() ) - // not allowed to remove leafs directly - return null; - if ( isAboveOnlyLeaves() ) - // we only consider removing nodes above all leaves - return new Pair, Double>(this, getPenalty()); - else { - // just recurse, taking the result with the min penalty of all subnodes - Pair, Double> minNode = null; - for ( final RecalDatumNode sub : subnodes ) { - final Pair, Double> subFind = sub.getMinPenaltyAboveLeafNode(); - if ( subFind != null && (minNode == null || subFind.getSecond() < minNode.getSecond()) ) { - minNode = subFind; - } - } - return minNode; - } - } - - /** - * Return a freshly allocated tree without the node nodeToRemove - * - * @param nodeToRemove - * @return - */ - private Pair, Boolean> removeNode(final RecalDatumNode nodeToRemove) { - if ( this == nodeToRemove ) { - if ( isLeaf() ) - throw new IllegalStateException("Trying to remove a leaf node from the tree! " + this + " " + nodeToRemove); - // node is the thing we are going to remove, but without any subnodes - final RecalDatumNode node = new RecalDatumNode(getRecalDatum(), fixedPenalty); - return new Pair, Boolean>(node, true); - } else { - // did we remove something in a sub branch? - boolean removedSomething = false; - - // our sub nodes with the penalty node removed - final Set> sub = new HashSet>(getNumSubnodes()); - - for ( final RecalDatumNode sub1 : subnodes ) { - if ( removedSomething ) { - // already removed something, just add sub1 back to sub - sub.add(sub1); - } else { - // haven't removed anything yet, so try - final Pair, Boolean> maybeRemoved = sub1.removeNode(nodeToRemove); - removedSomething = maybeRemoved.getSecond(); - sub.add(maybeRemoved.getFirst()); - } - } - - final RecalDatumNode node = new RecalDatumNode(getRecalDatum(), fixedPenalty, sub); - return new Pair, Boolean>(node, removedSomething); - } - } - - /** - * Return a collection of all of the data in the leaf nodes of this tree - * - * @return - */ - public Collection getAllLeaves() { - final LinkedList list = new LinkedList(); - getAllLeavesRec(list); - return list; - } - - /** - * Helpful recursive function for getAllLeaves() - * - * @param list the destination for the list of leaves - */ - private void getAllLeavesRec(final LinkedList list) { - if ( isLeaf() ) - list.add(getRecalDatum()); - else { - for ( final RecalDatumNode sub : subnodes ) - sub.getAllLeavesRec(list); - } - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalUtils.java deleted file mode 100644 index 9c5739466..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalUtils.java +++ /dev/null @@ -1,1097 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportTable; -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.classloader.JVMUtils; -import org.broadinstitute.gatk.utils.recalibration.covariates.*; -import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.R.RScriptExecutor; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.classloader.PluginManager; -import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.DynamicClassResolutionException; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.io.Resource; -import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; - -import java.io.*; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 6, 2009 - * - * This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions. - * It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias. - * This class holds the parsing methods that are shared between BaseRecalibrator and PrintReads. - */ - -public class RecalUtils { - public final static String ARGUMENT_REPORT_TABLE_TITLE = "Arguments"; - public final static String QUANTIZED_REPORT_TABLE_TITLE = "Quantized"; - public final static String READGROUP_REPORT_TABLE_TITLE = "RecalTable0"; - public final static String QUALITY_SCORE_REPORT_TABLE_TITLE = "RecalTable1"; - public final static String ALL_COVARIATES_REPORT_TABLE_TITLE = "RecalTable2"; - - public final static String ARGUMENT_COLUMN_NAME = "Argument"; - public final static String ARGUMENT_VALUE_COLUMN_NAME = "Value"; - public final static String QUANTIZED_VALUE_COLUMN_NAME = "QuantizedScore"; - public static final String QUANTIZED_COUNT_COLUMN_NAME = "Count"; - public final static String READGROUP_COLUMN_NAME = "ReadGroup"; - public final static String EVENT_TYPE_COLUMN_NAME = "EventType"; - public final static String EMPIRICAL_QUALITY_COLUMN_NAME = "EmpiricalQuality"; - public final static String ESTIMATED_Q_REPORTED_COLUMN_NAME = "EstimatedQReported"; - public final static String QUALITY_SCORE_COLUMN_NAME = "QualityScore"; - public final static String COVARIATE_VALUE_COLUMN_NAME = "CovariateValue"; - public final static String COVARIATE_NAME_COLUMN_NAME = "CovariateName"; - public final static String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations"; - public final static String NUMBER_ERRORS_COLUMN_NAME = "Errors"; - - private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams - private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color - private static boolean warnUserNullPlatform = false; - - private static final String SCRIPT_FILE = "BQSR.R"; - - private static final Pair covariateValue = new Pair(RecalUtils.COVARIATE_VALUE_COLUMN_NAME, "%s"); - private static final Pair covariateName = new Pair(RecalUtils.COVARIATE_NAME_COLUMN_NAME, "%s"); - private static final Pair eventType = new Pair(RecalUtils.EVENT_TYPE_COLUMN_NAME, "%s"); - private static final Pair empiricalQuality = new Pair(RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f"); - private static final Pair estimatedQReported = new Pair(RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f"); - private static final Pair nObservations = new Pair(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d"); - private static final Pair nErrors = new Pair(RecalUtils.NUMBER_ERRORS_COLUMN_NAME, "%.2f"); - - /** - * Generates two lists : required covariates and optional covariates based on the user's requests. - * - * Performs the following tasks in order: - * 1. Adds all requierd covariates in order - * 2. Check if the user asked to use the standard covariates and adds them all if that's the case - * 3. Adds all covariates requested by the user that were not already added by the two previous steps - * - * @param argumentCollection the argument collection object for the recalibration walker - * @return a pair of ordered lists : required covariates (first) and optional covariates (second) - */ - public static Pair, ArrayList> initializeCovariates(RecalibrationArgumentCollection argumentCollection) { - final List> covariateClasses = new PluginManager(Covariate.class).getPlugins(); - final List> requiredClasses = new PluginManager(RequiredCovariate.class).getPlugins(); - final List> standardClasses = new PluginManager(StandardCovariate.class).getPlugins(); - - final ArrayList requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates - ArrayList optionalCovariates = new ArrayList(); - if (!argumentCollection.DO_NOT_USE_STANDARD_COVARIATES) - optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user - - // parse the -cov arguments that were provided, skipping over the ones already specified - if (argumentCollection.COVARIATES != null) { - for (String requestedCovariateString : argumentCollection.COVARIATES) { - // help the transition from BQSR v1 to BQSR v2 - if ( requestedCovariateString.equals("DinucCovariate") ) - throw new UserException.CommandLineException("DinucCovariate has been retired. Please use its successor covariate " + - "ContextCovariate instead, which includes the 2 bp (dinuc) substitution model of the retired DinucCovariate " + - "as well as an indel context to model the indel error rates"); - - boolean foundClass = false; - for (Class covClass : covariateClasses) { - if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class - foundClass = true; - if (!requiredClasses.contains(covClass) && - (argumentCollection.DO_NOT_USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) { - try { - final Covariate covariate = covClass.newInstance(); // now that we've found a matching class, try to instantiate it - optionalCovariates.add(covariate); - } catch (Exception e) { - throw new DynamicClassResolutionException(covClass, e); - } - } - } - } - - if (!foundClass) { - throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates."); - } - } - } - return new Pair, ArrayList>(requiredCovariates, optionalCovariates); - } - - /** - * Adds the required covariates to a covariate list - * - * Note: this method really only checks if the classes object has the expected number of required covariates, then add them by hand. - * - * @param classes list of classes to add to the covariate list - * @return the covariate list - */ - private static ArrayList addRequiredCovariatesToList(List> classes) { - ArrayList dest = new ArrayList(classes.size()); - if (classes.size() != 2) - throw new ReviewedGATKException("The number of required covariates has changed, this is a hard change in the code and needs to be inspected"); - - dest.add(new ReadGroupCovariate()); // enforce the order with RG first and QS next. - dest.add(new QualityScoreCovariate()); - return dest; - } - - /** - * Adds the standard covariates to a covariate list - * - * @param classes list of classes to add to the covariate list - * @return the covariate list - */ - private static ArrayList addStandardCovariatesToList(List> classes) { - ArrayList dest = new ArrayList(classes.size()); - for (Class covClass : classes) { - try { - final Covariate covariate = (Covariate) covClass.newInstance(); - dest.add(covariate); - } catch (Exception e) { - throw new DynamicClassResolutionException(covClass, e); - } - } - return dest; - } - - /** - * Print a list of all available covariates to logger as info - * - * @param logger - */ - public static void listAvailableCovariates(final Logger logger) { - logger.info("Available covariates:"); - for (final Class covClass : new PluginManager(Covariate.class).getPlugins()) { - logger.info(String.format("\t%30s\t%s", covClass.getSimpleName(), JVMUtils.classInterfaces(covClass))); - } - } - - /** - * Component used to print out csv representation of the reports that can be use to perform analysis in - * external tools. E.g. generate plots using R scripts. - *

    - * A header is always printed into the output stream (or file) when the printer is created. Then you only need - * to call {@link #print(RecalibrationReport,String) print} for each report you want to include in the csv file. - * Once finished, you close the printer calling {@link #close() close} - * - */ - private static class CsvPrinter { - - private final PrintStream ps; - private final Covariate[] covariates; - - /** - * Constructs a printer redirected to an output file. - * @param out the output file. - * @param c covariates to print out. - * @throws FileNotFoundException if the file could not be created anew. - */ - protected CsvPrinter(final File out, final Covariate ... c) - throws FileNotFoundException { - this(new FileOutputStream(out), c); - } - - /** - * Constructs a printer redirected to an output stream - * @param os the output. - * @param c covariates to print out. - */ - protected CsvPrinter(final OutputStream os, final Covariate ... c) { - covariates = c == null ? new Covariate[0] : c.clone(); - ps = new PrintStream(os); - printHeader(); - } - - /** - * Prints the header out. - *

    - * Should only be invoked at creation. - */ - protected void printHeader() { - RecalUtils.printHeader(ps); - } - - /** - * Prints out a report into the csv file. - * - * - * @param report the report to print out. - * @param mode the report associated mode. (typically ORIGINAL, RECALIBRATED - */ - public void print(final RecalibrationReport report, final String mode) { - RecalUtils.writeCSV(ps,report.getRecalibrationTables(),mode,covariates,false); - } - - /** - * Close the csv printer. - * - * No further output will be allowed or take place after calling this method. - */ - public void close() { - ps.close(); - } - - } - - /** - * Returns a csv output printer. - * - * @param out the output file. It will be overridden - * @param c list of covariates to print out. - * - * @throws FileNotFoundException if out could not be created anew. - * - * @return never null - */ - protected static CsvPrinter csvPrinter(final File out, final Covariate ... c) - throws FileNotFoundException - { - if (c == null) { - throw new IllegalArgumentException("the input covariate array cannot be null"); - } - return new CsvPrinter(out,c); - } - - /** - * Prints out a collection of reports into a file in Csv format in a way - * that can be used by R scripts (such as the plot generator script). - *

    - * The set of covariates is take as the minimum common set from all reports. - * - * @param out the output file. It will be overridden. - * @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...) - * of each report and the corresponding value the report itself. - * @throws FileNotFoundException if out could not be created anew. - */ - public static void generateCsv(final File out, final Map reports) - throws FileNotFoundException { - if (reports.size() == 0) { - writeCsv(out, reports, new Covariate[0]); - } else { - final Iterator rit = reports.values().iterator(); - final RecalibrationReport first = rit.next(); - final Covariate[] firstCovariates = first.getRequestedCovariates(); - final Set covariates = new LinkedHashSet<>(); - Utils.addAll(covariates,firstCovariates); - while (rit.hasNext() && covariates.size() > 0) { - final Covariate[] nextCovariates = rit.next().getRequestedCovariates(); - final Set nextCovariateNames = new LinkedHashSet(nextCovariates.length); - for (final Covariate nc : nextCovariates) { - nextCovariateNames.add(nc.getClass().getSimpleName()); - } - final Iterator cit = covariates.iterator(); - while (cit.hasNext()) { - if (!nextCovariateNames.contains(cit.next().getClass().getSimpleName())) { - cit.remove(); - } - } - } - writeCsv(out, reports, covariates.toArray(new Covariate[covariates.size()])); - } - } - - /** - * Print out a collection of reports into a file in Csv format in a way - * that can be used by R scripts (such as the plot generator script). - * - * @param out - * @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...) - * of each report and the corresponding value the report itself. - * @param c the covariates to print out. - * @throws FileNotFoundException if out could not be created anew. - */ - private static void writeCsv(final File out, - final Map reports, final Covariate[] c) - throws FileNotFoundException { - final CsvPrinter p = csvPrinter(out,c); - for (Map.Entry e : reports.entrySet()) { - p.print(e.getValue(),e.getKey()); - } - p.close(); - } - - public enum SOLID_RECAL_MODE { - /** - * Treat reference inserted bases as reference matching bases. Very unsafe! - */ - DO_NOTHING, - /** - * Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option. - */ - SET_Q_ZERO, - /** - * In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV. - */ - SET_Q_ZERO_BASE_N, - /** - * Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. - */ - REMOVE_REF_BIAS; - - public static SOLID_RECAL_MODE recalModeFromString(String recalMode) { - if (recalMode.equals("DO_NOTHING")) - return SOLID_RECAL_MODE.DO_NOTHING; - if (recalMode.equals("SET_Q_ZERO")) - return SOLID_RECAL_MODE.SET_Q_ZERO; - if (recalMode.equals("SET_Q_ZERO_BASE_N")) - return SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N; - if (recalMode.equals("REMOVE_REF_BIAS")) - return SOLID_RECAL_MODE.REMOVE_REF_BIAS; - - throw new UserException.BadArgumentValue(recalMode, "is not a valid SOLID_RECAL_MODE value"); - } - } - - public enum SOLID_NOCALL_STRATEGY { - /** - * When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option. - */ - THROW_EXCEPTION, - /** - * Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare. - */ - LEAVE_READ_UNRECALIBRATED, - /** - * Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. - */ - PURGE_READ; - - public static SOLID_NOCALL_STRATEGY nocallStrategyFromString(String nocallStrategy) { - if (nocallStrategy.equals("THROW_EXCEPTION")) - return SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; - if (nocallStrategy.equals("LEAVE_READ_UNRECALIBRATED")) - return SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED; - if (nocallStrategy.equals("PURGE_READ")) - return SOLID_NOCALL_STRATEGY.PURGE_READ; - - throw new UserException.BadArgumentValue(nocallStrategy, "is not a valid SOLID_NOCALL_STRATEGY value"); - } - } - - private static List generateReportTables(final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) { - List result = new LinkedList(); - int reportTableIndex = 0; - int rowIndex = 0; - final Map covariateNameMap = new HashMap(requestedCovariates.length); - for (final Covariate covariate : requestedCovariates) - covariateNameMap.put(covariate, parseCovariateName(covariate)); - - for (int tableIndex = 0; tableIndex < recalibrationTables.numTables(); tableIndex++) { - - final ArrayList> columnNames = new ArrayList>(); // initialize the array to hold the column names - columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[0]), "%s")); // save the required covariate name so we can reference it in the future - if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) { - columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[1]), "%s")); // save the required covariate name so we can reference it in the future - if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { - columnNames.add(covariateValue); - columnNames.add(covariateName); - } - } - - columnNames.add(eventType); // the order of these column names is important here - columnNames.add(empiricalQuality); - if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) - columnNames.add(estimatedQReported); // only the read group table needs the estimated Q reported - columnNames.add(nObservations); - columnNames.add(nErrors); - - final GATKReportTable reportTable; - if (tableIndex <= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { - if(sortByCols) { - reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), GATKReportTable.TableSortingWay.SORT_BY_COLUMN); - } else { - reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), GATKReportTable.TableSortingWay.DO_NOT_SORT); - } - for (final Pair columnName : columnNames) - reportTable.addColumn(columnName.getFirst(), columnName.getSecond()); - rowIndex = 0; // reset the row index since we're starting with a new table - } else { - reportTable = result.get(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()); - } - - final NestedIntegerArray table = recalibrationTables.getTable(tableIndex); - for (final NestedIntegerArray.Leaf row : table.getAllLeaves()) { - final RecalDatum datum = (RecalDatum)row.value; - final int[] keys = row.keys; - - int columnIndex = 0; - int keyIndex = 0; - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), requestedCovariates[0].formatKey(keys[keyIndex++])); - if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) { - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), requestedCovariates[1].formatKey(keys[keyIndex++])); - if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { - final Covariate covariate = requestedCovariates[tableIndex]; - - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), covariate.formatKey(keys[keyIndex++])); - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), covariateNameMap.get(covariate)); - } - } - - final EventType event = EventType.eventFrom(keys[keyIndex]); - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), event.toString()); - - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEmpiricalQuality()); - if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getNumObservations()); - reportTable.set(rowIndex, columnNames.get(columnIndex).getFirst(), datum.getNumMismatches()); - - rowIndex++; - } - result.add(reportTable); - } - - return result; - } - - private static String parseCovariateName(final Covariate covariate) { - return covariate.getClass().getSimpleName().split("Covariate")[0]; - } - - /** - * Return a human-readable string representing the used covariates - * - * @param requestedCovariates a vector of covariates - * @return a non-null comma-separated string - */ - public static String covariateNames(final Covariate[] requestedCovariates) { - final List names = new ArrayList(requestedCovariates.length); - for ( final Covariate cov : requestedCovariates ) - names.add(cov.getClass().getSimpleName()); - return Utils.join(",", names); - } - - /** - * Outputs the GATK report to RAC.RECAL_TABLE. - * - * @param RAC The list of shared command line arguments - * @param quantizationInfo Quantization info - * @param recalibrationTables Recalibration tables - * @param requestedCovariates The list of requested covariates - * @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT - */ - public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) { - final GATKReport report = createRecalibrationGATKReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols)); - report.print(RAC.RECAL_TABLE); - } - - /** - * Creates a consolidated GATK report, first generating report tables. Report can then be written to a stream via GATKReport.print(PrintStream). - * - * @param argumentTable Argument table - * @param quantizationInfo Quantization info - * @param recalibrationTables Recalibration tables - * @param requestedCovariates The list of requested covariates - * @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT - * @return GATK report - */ - public static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final boolean sortByCols) { - return createRecalibrationGATKReport(argumentTable, quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols)); - } - - /** - * Creates a consolidated GATK report from the tables. Report can then be written to a stream via GATKReport.print(PrintStream). - * - * @param argumentTable Argument table - * @param quantizationTable Quantization Table - * @param recalTables Other recal tables - * @return GATK report - */ - private static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List recalTables) { - final GATKReport report = new GATKReport(); - report.addTable(argumentTable); - report.addTable(quantizationTable); - report.addTables(recalTables); - return report; - } - - /** s - * Write recalibration plots into a file - * - * @param csvFile location of the intermediary file - * @param exampleReportFile where the report arguments are collected from. - * @param output result plot file name. - */ - public static void generatePlots(final File csvFile, final File exampleReportFile, final File output) { - final RScriptExecutor executor = new RScriptExecutor(); - executor.setExceptOnError(true); - executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class)); - executor.addArgs(csvFile.getAbsolutePath()); - executor.addArgs(exampleReportFile.getAbsolutePath()); - executor.addArgs(output.getAbsolutePath()); - Logger.getLogger(RecalUtils.class).debug("R command line: " + executor.getApproximateCommandLine()); - executor.exec(); - } - - private static void outputRecalibrationPlot(final File csvFile, final RecalibrationArgumentCollection RAC) { - - final RScriptExecutor executor = new RScriptExecutor(); - executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class)); - executor.addArgs(csvFile.getAbsolutePath()); - executor.addArgs(RAC.RECAL_TABLE_FILE.getAbsolutePath()); - executor.exec(); - } - - /** - * Please use {@link #generateCsv(java.io.File, java.util.Map)} and {@link #generatePlots(java.io.File, java.io.File, java.io.File)} instead. - * - * @deprecated - */ - @Deprecated - public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final Covariate[] requestedCovariates) { - generateRecalibrationPlot(RAC, original, null, requestedCovariates); - } - - /** - * Please use {@link #generateCsv(java.io.File, java.util.Map)} and {@link #generatePlots(java.io.File, java.io.File, java.io.File)} instead. - * - * @deprecated - */ - @Deprecated - public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates) { - final PrintStream csvStream; - final File csvTempFile = null; - try { - File csvTmpFile = File.createTempFile("BQSR",".csv"); - csvTmpFile.deleteOnExit(); - csvStream = new PrintStream(csvTmpFile); - } catch (IOException e) { - throw new UserException("Could not create temporary csv file", e); - } - - if ( recalibrated != null ) - writeCSV(csvStream, recalibrated, "RECALIBRATED", requestedCovariates, true); - writeCSV(csvStream, original, "ORIGINAL", requestedCovariates, recalibrated == null); - csvStream.close(); - outputRecalibrationPlot(csvTempFile, RAC); - csvTempFile.delete(); - } - - private static void writeCSV(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final Covariate[] requestedCovariates, final boolean printHeader) { - - final NestedIntegerArray deltaTable = createDeltaTable(recalibrationTables, requestedCovariates.length); - - // add the quality score table to the delta table - final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); - for (final NestedIntegerArray.Leaf leaf : qualTable.getAllLeaves()) { // go through every element in the covariates table to create the delta table - final int[] newCovs = new int[4]; - newCovs[0] = leaf.keys[0]; - newCovs[1] = requestedCovariates.length; // replace the covariate name with an arbitrary (unused) index for QualityScore - newCovs[2] = leaf.keys[1]; - newCovs[3] = leaf.keys[2]; - addToDeltaTable(deltaTable, newCovs, (RecalDatum)leaf.value); // add this covariate to the delta table - } - - // add the optional covariates to the delta table - for (int i = RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < requestedCovariates.length; i++) { - final NestedIntegerArray covTable = recalibrationTables.getTable(i); - for (final NestedIntegerArray.Leaf leaf : covTable.getAllLeaves()) { - final int[] covs = new int[4]; - covs[0] = leaf.keys[0]; - covs[1] = i; // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) - covs[2] = leaf.keys[2]; - covs[3] = leaf.keys[3]; - addToDeltaTable(deltaTable, covs, (RecalDatum) leaf.value); // add this covariate to the delta table - } - } - - // output the csv file - if (printHeader) { - printHeader(deltaTableFile); - } - - final Map covariateNameMap = new HashMap(requestedCovariates.length); - for (final Covariate covariate : requestedCovariates) - covariateNameMap.put(covariate, parseCovariateName(covariate)); - - // print each data line - for (final NestedIntegerArray.Leaf leaf : deltaTable.getAllLeaves()) { - final List deltaKeys = generateValuesFromKeys(leaf.keys, requestedCovariates, covariateNameMap); - final RecalDatum deltaDatum = leaf.value; - deltaTableFile.print(Utils.join(",", deltaKeys)); - deltaTableFile.print("," + deltaDatum.stringForCSV()); - deltaTableFile.println("," + recalibrationMode); - } - } - - private static void printHeader(PrintStream out) { - final List header = new LinkedList(); - header.add("ReadGroup"); - header.add("CovariateValue"); - header.add("CovariateName"); - header.add("EventType"); - header.add("Observations"); - header.add("Errors"); - header.add("EmpiricalQuality"); - header.add("AverageReportedQuality"); - header.add("Accuracy"); - header.add("Recalibration"); - out.println(Utils.join(",", header)); - } - - /* - * Return an initialized nested integer array with appropriate dimensions for use with the delta tables - * - * @param recalibrationTables the recal tables - * @param numCovariates the total number of covariates being used - * @return a non-null nested integer array - */ - @Requires("recalibrationTables != null && numCovariates > 0") - @Ensures("result != null") - private static NestedIntegerArray createDeltaTable(final RecalibrationTables recalibrationTables, final int numCovariates) { - - final int[] dimensionsForDeltaTable = new int[4]; - - // initialize the dimensions with those of the qual table to start with - final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); - final int[] dimensionsOfQualTable = qualTable.getDimensions(); - dimensionsForDeltaTable[0] = dimensionsOfQualTable[0]; // num read groups - dimensionsForDeltaTable[1] = numCovariates + 1; // num covariates - dimensionsForDeltaTable[2] = dimensionsOfQualTable[1]; - dimensionsForDeltaTable[3] = dimensionsOfQualTable[2]; - - // now, update the dimensions based on the optional covariate tables as needed - for ( int i = RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < numCovariates; i++ ) { - final NestedIntegerArray covTable = recalibrationTables.getTable(i); - final int[] dimensionsOfCovTable = covTable.getDimensions(); - dimensionsForDeltaTable[2] = Math.max(dimensionsForDeltaTable[2], dimensionsOfCovTable[2]); - dimensionsForDeltaTable[3] = Math.max(dimensionsForDeltaTable[3], dimensionsOfCovTable[3]); - } - - return new NestedIntegerArray(dimensionsForDeltaTable); - } - - protected static List generateValuesFromKeys(final int[] keys, final Covariate[] covariates, final Map covariateNameMap) { - final List values = new ArrayList(4); - values.add(covariates[RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()].formatKey(keys[0])); - - final int covariateIndex = keys[1]; - final int covariateKey = keys[2]; - final Covariate covariate = covariateIndex == covariates.length ? covariates[RecalibrationTables.TableType.QUALITY_SCORE_TABLE.ordinal()] : covariates[covariateIndex]; - values.add(covariate.formatKey(covariateKey)); - values.add(covariateNameMap.get(covariate)); - values.add(EventType.eventFrom(keys[3]).prettyPrint()); - - return values; - } - - /** - * Updates the current RecalDatum element in the delta table. - * - * If it doesn't have an element yet, it creates an RecalDatum element and adds it to the delta table. - * - * @param deltaTable the delta table - * @param deltaKey the key to the table - * @param recalDatum the recal datum to combine with the accuracyDatum element in the table - */ - private static void addToDeltaTable(final NestedIntegerArray deltaTable, final int[] deltaKey, final RecalDatum recalDatum) { - final RecalDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key - if (deltaDatum == null) - // if we don't have a key yet, create a new one with the same values as the current datum - deltaTable.put(new RecalDatum(recalDatum), deltaKey); - else - // if we do have a datum, combine it with this one - deltaDatum.combine(recalDatum); - } - - /** - * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string - * - * @param read The read to adjust - * @param RAC The list of shared command line arguments - */ - public static void parsePlatformForRead(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) { - GATKSAMReadGroupRecord readGroup = read.getReadGroup(); - - if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { - readGroup.setPlatform(RAC.FORCE_PLATFORM); - } - - if (readGroup.getPlatform() == null) { - if (RAC.DEFAULT_PLATFORM != null) { - if (!warnUserNullPlatform) { - Utils.warnUser("The input .bam file contains reads with no platform information. " + - "Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " + - "First observed at read with name = " + read.getReadName()); - warnUserNullPlatform = true; - } - readGroup.setPlatform(RAC.DEFAULT_PLATFORM); - } - else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName()); - } - } - } - - /** - * Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are - * inconsistent with the color space. If there is a no call in the color space, this method returns false meaning - * this read should be skipped - * - * @param strategy the strategy used for SOLID no calls - * @param read The SAMRecord to parse - * @return true if this read is consistent or false if this read should be skipped - */ - public static boolean isColorSpaceConsistent(final SOLID_NOCALL_STRATEGY strategy, final GATKSAMRecord read) { - if (!ReadUtils.isSOLiDRead(read)) // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base - return true; - - // Haven't calculated the inconsistency array yet for this read - if (read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG) == null) { - final Object attr = read.getAttribute(RecalUtils.COLOR_SPACE_ATTRIBUTE_TAG); - if (attr != null) { - byte[] colorSpace; - if (attr instanceof String) - colorSpace = ((String) attr).getBytes(); - else - throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", RecalUtils.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); - - final boolean badColor = hasNoCallInColorSpace(colorSpace); - if (badColor) { - if (strategy == SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) { - return false; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them - } - else if (strategy == SOLID_NOCALL_STRATEGY.PURGE_READ) { - read.setReadFailsVendorQualityCheckFlag(true); - return false; - } - } - - byte[] readBases = read.getReadBases(); // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read - if (read.getReadNegativeStrandFlag()) - readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); - - final byte[] inconsistency = new byte[readBases.length]; - int i; - byte prevBase = colorSpace[0]; // The sentinel - for (i = 0; i < readBases.length; i++) { - final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[i + 1]); - inconsistency[i] = (byte) (thisBase == readBases[i] ? 0 : 1); - prevBase = readBases[i]; - } - read.setAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency); - } - else if (strategy == SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) // if the strategy calls for an exception, throw it - throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); - - else - return false; // otherwise, just skip the read - } - - return true; - } - - private static boolean hasNoCallInColorSpace(final byte[] colorSpace) { - final int length = colorSpace.length; - for (int i = 1; i < length; i++) { // skip the sentinal - final byte color = colorSpace[i]; - if (color != (byte) '0' && color != (byte) '1' && color != (byte) '2' && color != (byte) '3') { - return true; // There is a bad color in this SOLiD read - } - } - - return false; // There aren't any color no calls in this SOLiD read - } - - /** - * Given the base and the color calculate the next base in the sequence - * - * @param read the read - * @param prevBase The base - * @param color The color - * @return The next base in the sequence - */ - private static byte getNextBaseFromColor(GATKSAMRecord read, final byte prevBase, final byte color) { - switch (color) { - case '0': - return prevBase; - case '1': - return performColorOne(prevBase); - case '2': - return performColorTwo(prevBase); - case '3': - return performColorThree(prevBase); - default: - throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLID read, color = " + (char) color + - " Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias."); - } - } - - /** - * Check if this base is inconsistent with its color space. If it is then SOLID inserted the reference here and we should reduce the quality - * - * @param read The read which contains the color space to check against - * @param offset The offset in the read at which to check - * @return Returns true if the base was inconsistent with the color space - */ - public static boolean isColorSpaceConsistent(final GATKSAMRecord read, final int offset) { - final Object attr = read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG); - if (attr != null) { - final byte[] inconsistency = (byte[]) attr; - // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference! - if (read.getReadNegativeStrandFlag()) { // Negative direction - return inconsistency[inconsistency.length - offset - 1] == (byte) 0; - } - else { // Forward direction - return inconsistency[offset] == (byte) 0; - } - - // This block of code is for if you want to check both the offset and the next base for color space inconsistency - //if( read.getReadNegativeStrandFlag() ) { // Negative direction - // if( offset == 0 ) { - // return inconsistency[0] != 0; - // } else { - // return (inconsistency[inconsistency.length - offset - 1] != 0) || (inconsistency[inconsistency.length - offset] != 0); - // } - //} else { // Forward direction - // if( offset == inconsistency.length - 1 ) { - // return inconsistency[inconsistency.length - 1] != 0; - // } else { - // return (inconsistency[offset] != 0) || (inconsistency[offset + 1] != 0); - // } - //} - - } - else { // No inconsistency array, so nothing is inconsistent - return true; - } - } - - /** - * Computes all requested covariates for every offset in the given read - * by calling covariate.getValues(..). - * - * It populates an array of covariate values where result[i][j] is the covariate - * value for the ith position in the read and the jth covariate in - * reqeustedCovariates list. - * - * @param read The read for which to compute covariate values. - * @param requestedCovariates The list of requested covariates. - * @return a matrix with all the covariates calculated for every base in the read - */ - public static ReadCovariates computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates) { - final ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), requestedCovariates.length); - computeCovariates(read, requestedCovariates, readCovariates); - return readCovariates; - } - - /** - * Computes all requested covariates for every offset in the given read - * by calling covariate.getValues(..). - * - * It populates an array of covariate values where result[i][j] is the covariate - * value for the ith position in the read and the jth covariate in - * reqeustedCovariates list. - * - * @param read The read for which to compute covariate values. - * @param requestedCovariates The list of requested covariates. - * @param resultsStorage The object to store the covariate values - */ - public static void computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates, final ReadCovariates resultsStorage) { - // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read - for (int i = 0; i < requestedCovariates.length; i++) { - resultsStorage.setCovariateIndex(i); - requestedCovariates[i].recordValues(read, resultsStorage); - } - } - - /** - * Perform a certain transversion (A <-> C or G <-> T) on the base. - * - * @param base the base [AaCcGgTt] - * @return the transversion of the base, or the input base if it's not one of the understood ones - */ - private static byte performColorOne(byte base) { - switch (base) { - case 'A': - case 'a': - return 'C'; - case 'C': - case 'c': - return 'A'; - case 'G': - case 'g': - return 'T'; - case 'T': - case 't': - return 'G'; - default: - return base; - } - } - - /** - * Perform a transition (A <-> G or C <-> T) on the base. - * - * @param base the base [AaCcGgTt] - * @return the transition of the base, or the input base if it's not one of the understood ones - */ - private static byte performColorTwo(byte base) { - switch (base) { - case 'A': - case 'a': - return 'G'; - case 'C': - case 'c': - return 'T'; - case 'G': - case 'g': - return 'A'; - case 'T': - case 't': - return 'C'; - default: - return base; - } - } - - /** - * Return the complement (A <-> T or C <-> G) of a base. - * - * @param base the base [AaCcGgTt] - * @return the complementary base, or the input base if it's not one of the understood ones - */ - private static byte performColorThree(byte base) { - switch (base) { - case 'A': - case 'a': - return 'T'; - case 'C': - case 'c': - return 'G'; - case 'G': - case 'g': - return 'C'; - case 'T': - case 't': - return 'A'; - default: - return base; - } - } - - /** - * Combines the recalibration data for table1 and table2 into table1 - * - * Note that table1 is the destination, so it is modified - * - * @param table1 the destination table to merge table2 into - * @param table2 the source table to merge into table1 - */ - public static void combineTables(final NestedIntegerArray table1, final NestedIntegerArray table2) { - if ( table1 == null ) throw new IllegalArgumentException("table1 cannot be null"); - if ( table2 == null ) throw new IllegalArgumentException("table2 cannot be null"); - if ( ! Arrays.equals(table1.getDimensions(), table2.getDimensions())) - throw new IllegalArgumentException("Table1 " + Utils.join(",", table1.getDimensions()) + " not equal to " + Utils.join(",", table2.getDimensions())); - - for (final NestedIntegerArray.Leaf row : table2.getAllLeaves()) { - final RecalDatum myDatum = table1.get(row.keys); - - if (myDatum == null) - table1.put(row.value, row.keys); - else - myDatum.combine(row.value); - } - } - - /** - * Increments the RecalDatum at the specified position in the specified table, or put a new item there - * if there isn't already one. - * - * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() - * to return false if another thread inserts a new item at our position in the middle of our put operation. - * - * @param table the table that holds/will hold our item - * @param qual qual for this event - * @param isError error value for this event - * @param keys location in table of our item - */ - public static void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, - final byte qual, - final double isError, - final int... keys ) { - final RecalDatum existingDatum = table.get(keys); - - if ( existingDatum == null ) { - // No existing item, try to put a new one - if ( ! table.put(createDatumObject(qual, isError), keys) ) { - // Failed to put a new item because another thread came along and put an item here first. - // Get the newly-put item and increment it (item is guaranteed to exist at this point) - table.get(keys).increment(1L, isError); - } - } - else { - // Easy case: already an item here, so increment it - existingDatum.increment(1L, isError); - } - } - - /** - * creates a datum object with one observation and one or zero error - * - * @param reportedQual the quality score reported by the instrument for this base - * @param isError whether or not the observation is an error - * @return a new RecalDatum object with the observation and the error - */ - private static RecalDatum createDatumObject(final byte reportedQual, final double isError) { - return new RecalDatum(1, isError, reportedQual); - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationReport.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationReport.java deleted file mode 100644 index a2b83ccb6..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationReport.java +++ /dev/null @@ -1,425 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportTable; -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.recalibration.covariates.Covariate; - -import java.io.*; -import java.util.*; - -/** - * This class has all the static functionality for reading a recalibration report file into memory. - * - * @author carneiro - * @since 3/26/12 - */ -public class RecalibrationReport { - private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done) - private final RecalibrationTables recalibrationTables; // quick access reference to the tables - private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation - private final HashMap optionalCovariateIndexes; - - private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes - private final RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter - - private final int[] tempRGarray = new int[2]; - private final int[] tempQUALarray = new int[3]; - private final int[] tempCOVarray = new int[4]; - - public RecalibrationReport(final File recalFile) { - this(recalFile, getReadGroups(recalFile)); - } - - public RecalibrationReport(final File recalFile, final SortedSet allReadGroups) { - final GATKReport report = new GATKReport(recalFile); - - argumentTable = report.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE); - RAC = initializeArgumentCollectionTable(argumentTable); - - GATKReportTable quantizedTable = report.getTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE); - quantizationInfo = initializeQuantizationTable(quantizedTable); - - Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates - ArrayList requiredCovariates = covariates.getFirst(); - ArrayList optionalCovariates = covariates.getSecond(); - requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; - optionalCovariateIndexes = new HashMap(optionalCovariates.size()); - int covariateIndex = 0; - for (final Covariate covariate : requiredCovariates) - requestedCovariates[covariateIndex++] = covariate; - for (final Covariate covariate : optionalCovariates) { - requestedCovariates[covariateIndex] = covariate; - final String covariateName = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport - optionalCovariateIndexes.put(covariateName, covariateIndex-2); - covariateIndex++; - } - - for (Covariate cov : requestedCovariates) - cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection - - recalibrationTables = new RecalibrationTables(requestedCovariates, allReadGroups.size()); - - initializeReadGroupCovariates(allReadGroups); - - parseReadGroupTable(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getReadGroupTable()); - - parseQualityScoreTable(report.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE), recalibrationTables.getQualityScoreTable()); - - parseAllCovariatesTable(report.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE), recalibrationTables); - - } - - /** - * Gets the unique read groups in the recal file - * - * @param recalFile the recal file as a GATK Report - * @return the unique read groups - */ - public static SortedSet getReadGroups(final File recalFile) { - return getReadGroups(new GATKReport(recalFile)); - } - - /** - * Gets the unique read groups in the table - * - * @param report the GATKReport containing the table with RecalUtils.READGROUP_REPORT_TABLE_TITLE - * @return the unique read groups - */ - private static SortedSet getReadGroups(final GATKReport report) { - final GATKReportTable reportTable = report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE); - final SortedSet readGroups = new TreeSet(); - for ( int i = 0; i < reportTable.getNumRows(); i++ ) - readGroups.add(reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME).toString()); - return readGroups; - } - - /** - * Combines two recalibration reports by adding all observations and errors - * - * Note: This method DOES NOT recalculate the empirical qualities and quantized qualities. You have to recalculate - * them after combining. The reason for not calculating it is because this function is intended for combining a - * series of recalibration reports, and it only makes sense to calculate the empirical qualities and quantized - * qualities after all the recalibration reports have been combined. Having the user recalculate when appropriate, - * makes this method faster - * - * Note2: The empirical quality reported, however, is recalculated given its simplicity. - * - * @param other the recalibration report to combine with this one - */ - public void combine(final RecalibrationReport other) { - for ( int tableIndex = 0; tableIndex < recalibrationTables.numTables(); tableIndex++ ) { - final NestedIntegerArray myTable = recalibrationTables.getTable(tableIndex); - final NestedIntegerArray otherTable = other.recalibrationTables.getTable(tableIndex); - RecalUtils.combineTables(myTable, otherTable); - } - } - - public QuantizationInfo getQuantizationInfo() { - return quantizationInfo; - } - - public RecalibrationTables getRecalibrationTables() { - return recalibrationTables; - } - - public Covariate[] getRequestedCovariates() { - return requestedCovariates; - } - - /** - * Initialize read group keys using the shared list of all the read groups. - * - * By using the same sorted set of read groups across all recalibration reports, even if - * one report is missing a read group, all the reports use the same read group keys. - * - * @param allReadGroups The list of all possible read groups - */ - private void initializeReadGroupCovariates(final SortedSet allReadGroups) { - for (String readGroup: allReadGroups) { - requestedCovariates[0].keyFromValue(readGroup); - } - } - - /** - * Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table - * - * @param reportTable the GATKReport table containing data for this table - * @param recalibrationTables the recalibration tables -\ */ - private void parseAllCovariatesTable(final GATKReportTable reportTable, final RecalibrationTables recalibrationTables) { - for ( int i = 0; i < reportTable.getNumRows(); i++ ) { - final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); - tempCOVarray[0] = requestedCovariates[0].keyFromValue(rg); - final Object qual = reportTable.get(i, RecalUtils.QUALITY_SCORE_COLUMN_NAME); - tempCOVarray[1] = requestedCovariates[1].keyFromValue(qual); - - final String covName = (String)reportTable.get(i, RecalUtils.COVARIATE_NAME_COLUMN_NAME); - final int covIndex = optionalCovariateIndexes.get(covName); - final Object covValue = reportTable.get(i, RecalUtils.COVARIATE_VALUE_COLUMN_NAME); - tempCOVarray[2] = requestedCovariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + covIndex].keyFromValue(covValue); - - final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); - tempCOVarray[3] = event.ordinal(); - - recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + covIndex).put(getRecalDatum(reportTable, i, false), tempCOVarray); - } - } - - /** - * - * Compiles the list of keys for the QualityScore table and uses the shared parsing utility to produce the actual table - * @param reportTable the GATKReport table containing data for this table - * @param qualTable the map representing this table - */ - private void parseQualityScoreTable(final GATKReportTable reportTable, final NestedIntegerArray qualTable) { - for ( int i = 0; i < reportTable.getNumRows(); i++ ) { - final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); - tempQUALarray[0] = requestedCovariates[0].keyFromValue(rg); - final Object qual = reportTable.get(i, RecalUtils.QUALITY_SCORE_COLUMN_NAME); - tempQUALarray[1] = requestedCovariates[1].keyFromValue(qual); - final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); - tempQUALarray[2] = event.ordinal(); - - qualTable.put(getRecalDatum(reportTable, i, false), tempQUALarray); - } - } - - /** - * Compiles the list of keys for the ReadGroup table and uses the shared parsing utility to produce the actual table - * - * @param reportTable the GATKReport table containing data for this table - * @param rgTable the map representing this table - */ - private void parseReadGroupTable(final GATKReportTable reportTable, final NestedIntegerArray rgTable) { - for ( int i = 0; i < reportTable.getNumRows(); i++ ) { - final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); - tempRGarray[0] = requestedCovariates[0].keyFromValue(rg); - final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); - tempRGarray[1] = event.ordinal(); - - rgTable.put(getRecalDatum(reportTable, i, true), tempRGarray); - } - } - - private double asDouble(final Object o) { - if ( o instanceof Double ) - return (Double)o; - else if ( o instanceof Integer ) - return (Integer)o; - else if ( o instanceof Long ) - return (Long)o; - else - throw new ReviewedGATKException("Object " + o + " is expected to be either a double, long or integer but it's not either: " + o.getClass()); - } - - private long asLong(final Object o) { - if ( o instanceof Long ) - return (Long)o; - else if ( o instanceof Integer ) - return ((Integer)o).longValue(); - else if ( o instanceof Double ) - return ((Double)o).longValue(); - else - throw new ReviewedGATKException("Object " + o + " is expected to be a long but it's not: " + o.getClass()); - } - - private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) { - final long nObservations = asLong(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME)); - final double nErrors = asDouble(reportTable.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME)); - //final double empiricalQuality = asDouble(reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME)); - - // the estimatedQreported column only exists in the ReadGroup table - final double estimatedQReported = hasEstimatedQReportedColumn ? - (Double) reportTable.get(row, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table - Byte.parseByte((String) reportTable.get(row, RecalUtils.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table - - final RecalDatum datum = new RecalDatum(nObservations, nErrors, (byte)1); - datum.setEstimatedQReported(estimatedQReported); - //datum.setEmpiricalQuality(empiricalQuality); // don't set the value here because we will want to recompute with a different conditional Q score prior value - return datum; - } - - /** - * Parses the quantization table from the GATK Report and turns it into a map of original => quantized quality scores - * - * @param table the GATKReportTable containing the quantization mappings - * @return an ArrayList with the quantization mappings from 0 to MAX_SAM_QUAL_SCORE - */ - private QuantizationInfo initializeQuantizationTable(GATKReportTable table) { - final Byte[] quals = new Byte[QualityUtils.MAX_SAM_QUAL_SCORE + 1]; - final Long[] counts = new Long[QualityUtils.MAX_SAM_QUAL_SCORE + 1]; - for ( int i = 0; i < table.getNumRows(); i++ ) { - final byte originalQual = (byte)i; - final Object quantizedObject = table.get(i, RecalUtils.QUANTIZED_VALUE_COLUMN_NAME); - final Object countObject = table.get(i, RecalUtils.QUANTIZED_COUNT_COLUMN_NAME); - final byte quantizedQual = Byte.parseByte(quantizedObject.toString()); - final long quantizedCount = Long.parseLong(countObject.toString()); - quals[originalQual] = quantizedQual; - counts[originalQual] = quantizedCount; - } - return new QuantizationInfo(Arrays.asList(quals), Arrays.asList(counts)); - } - - /** - * Parses the arguments table from the GATK Report and creates a RAC object with the proper initialization values - * - * @param table the GATKReportTable containing the arguments and its corresponding values - * @return a RAC object properly initialized with all the objects in the table - */ - private RecalibrationArgumentCollection initializeArgumentCollectionTable(GATKReportTable table) { - final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - - for ( int i = 0; i < table.getNumRows(); i++ ) { - final String argument = table.get(i, "Argument").toString(); - Object value = table.get(i, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); - if (value.equals("null")) - value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport - - if (argument.equals("covariate") && value != null) - RAC.COVARIATES = value.toString().split(","); - - else if (argument.equals("standard_covs")) - RAC.DO_NOT_USE_STANDARD_COVARIATES = Boolean.parseBoolean((String) value); - - else if (argument.equals("solid_recal_mode")) - RAC.SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.recalModeFromString((String) value); - - else if (argument.equals("solid_nocall_strategy")) - RAC.SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.nocallStrategyFromString((String) value); - - else if (argument.equals("mismatches_context_size")) - RAC.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value); - - else if (argument.equals("indels_context_size")) - RAC.INDELS_CONTEXT_SIZE = Integer.parseInt((String) value); - - else if (argument.equals("mismatches_default_quality")) - RAC.MISMATCHES_DEFAULT_QUALITY = Byte.parseByte((String) value); - - else if (argument.equals("insertions_default_quality")) - RAC.INSERTIONS_DEFAULT_QUALITY = Byte.parseByte((String) value); - - else if (argument.equals("deletions_default_quality")) - RAC.DELETIONS_DEFAULT_QUALITY = Byte.parseByte((String) value); - - else if (argument.equals("maximum_cycle_value")) - RAC.MAXIMUM_CYCLE_VALUE = Integer.parseInt((String) value); - - else if (argument.equals("low_quality_tail")) - RAC.LOW_QUAL_TAIL = Byte.parseByte((String) value); - - else if (argument.equals("default_platform")) - RAC.DEFAULT_PLATFORM = (String) value; - - else if (argument.equals("force_platform")) - RAC.FORCE_PLATFORM = (String) value; - - else if (argument.equals("quantizing_levels")) - RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value); - - else if (argument.equals("recalibration_report")) - RAC.existingRecalibrationReport = (value == null) ? null : new File((String) value); - - else if (argument.equals("binary_tag_name")) - RAC.BINARY_TAG_NAME = (value == null) ? null : (String) value; - - else if (argument.equals("sort_by_all_columns")) - RAC.SORT_BY_ALL_COLUMNS = Boolean.parseBoolean((String) value); - } - - return RAC; - } - - /** - * this functionality avoids recalculating the empirical qualities, estimated reported quality - * and quantization of the quality scores during every call of combine(). Very useful for the BQSRGatherer. - */ - public void calculateQuantizedQualities() { - quantizationInfo = new QuantizationInfo(recalibrationTables, RAC.QUANTIZING_LEVELS); - } - - /** - * Creates the recalibration report. Report can then be written to a stream via GATKReport.print(PrintStream). - * - * @return newly created recalibration report - */ - public GATKReport createGATKReport() { - return RecalUtils.createRecalibrationGATKReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, RAC.SORT_BY_ALL_COLUMNS); - } - - public RecalibrationArgumentCollection getRAC() { - return RAC; - } - - /** - * - * @deprecated use {@link #getRequestedCovariates()} instead. - */ - @Deprecated - public Covariate[] getCovariates() { - return requestedCovariates; - } - - /** - * @return true if the report has no data - */ - public boolean isEmpty() { - return recalibrationTables.isEmpty(); - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTables.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTables.java deleted file mode 100644 index e1c7820a4..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTables.java +++ /dev/null @@ -1,168 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import com.google.java.contract.Ensures; -import org.broadinstitute.gatk.utils.collections.LoggingNestedIntegerArray; -import org.broadinstitute.gatk.utils.recalibration.covariates.Covariate; -import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; - -import java.io.PrintStream; -import java.util.ArrayList; - -/** - * Utility class to facilitate on-the-fly base quality score recalibration. - * - * User: ebanks - * Date: 6/20/12 - */ - -public final class RecalibrationTables { - public enum TableType { - READ_GROUP_TABLE, - QUALITY_SCORE_TABLE, - OPTIONAL_COVARIATE_TABLES_START; - } - - private final ArrayList> tables; - private final int qualDimension; - private final int eventDimension = EventType.values().length; - private final int numReadGroups; - private final PrintStream log; - - public RecalibrationTables(final Covariate[] covariates) { - this(covariates, covariates[TableType.READ_GROUP_TABLE.ordinal()].maximumKeyValue() + 1, null); - } - - public RecalibrationTables(final Covariate[] covariates, final int numReadGroups) { - this(covariates, numReadGroups, null); - } - - public RecalibrationTables(final Covariate[] covariates, final int numReadGroups, final PrintStream log) { - tables = new ArrayList>(covariates.length); - for ( int i = 0; i < covariates.length; i++ ) - tables.add(i, null); // initialize so we can set below - - qualDimension = covariates[TableType.QUALITY_SCORE_TABLE.ordinal()].maximumKeyValue() + 1; - this.numReadGroups = numReadGroups; - this.log = log; - - tables.set(TableType.READ_GROUP_TABLE.ordinal(), - log == null ? new NestedIntegerArray(numReadGroups, eventDimension) : - new LoggingNestedIntegerArray(log, "READ_GROUP_TABLE", numReadGroups, eventDimension)); - - tables.set(TableType.QUALITY_SCORE_TABLE.ordinal(), makeQualityScoreTable()); - - for (int i = TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < covariates.length; i++) - tables.set(i, - log == null ? new NestedIntegerArray(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension) : - new LoggingNestedIntegerArray(log, String.format("OPTIONAL_COVARIATE_TABLE_%d", i - TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + 1), - numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension)); - } - - @Ensures("result != null") - public NestedIntegerArray getReadGroupTable() { - return getTable(TableType.READ_GROUP_TABLE.ordinal()); - } - - @Ensures("result != null") - public NestedIntegerArray getQualityScoreTable() { - return getTable(TableType.QUALITY_SCORE_TABLE.ordinal()); - } - - @Ensures("result != null") - public NestedIntegerArray getTable(final int index) { - return tables.get(index); - } - - @Ensures("result >= 0") - public int numTables() { - return tables.size(); - } - - /** - * @return true if all the tables contain no RecalDatums - */ - public boolean isEmpty() { - for( final NestedIntegerArray table : tables ) { - if( !table.getAllValues().isEmpty() ) { return false; } - } - return true; - } - - /** - * Allocate a new quality score table, based on requested parameters - * in this set of tables, without any data in it. The return result - * of this table is suitable for acting as a thread-local cache - * for quality score values - * @return a newly allocated, empty read group x quality score table - */ - public NestedIntegerArray makeQualityScoreTable() { - return log == null - ? new NestedIntegerArray(numReadGroups, qualDimension, eventDimension) - : new LoggingNestedIntegerArray(log, "QUALITY_SCORE_TABLE", numReadGroups, qualDimension, eventDimension); - } - - /** - * Merge all of the tables from toMerge into into this set of tables - */ - public void combine(final RecalibrationTables toMerge) { - if ( numTables() != toMerge.numTables() ) - throw new IllegalArgumentException("Attempting to merge RecalibrationTables with different sizes"); - - for ( int i = 0; i < numTables(); i++ ) { - final NestedIntegerArray myTable = this.getTable(i); - final NestedIntegerArray otherTable = toMerge.getTable(i); - RecalUtils.combineTables(myTable, otherTable); - } - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ContextCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ContextCovariate.java deleted file mode 100644 index 99264430d..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ContextCovariate.java +++ /dev/null @@ -1,304 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.clipping.ClippingRepresentation; -import org.broadinstitute.gatk.utils.clipping.ReadClipper; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.recalibration.ReadCovariates; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.ArrayList; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: 9/26/11 - */ - -public class ContextCovariate implements StandardCovariate { - private final static Logger logger = Logger.getLogger(ContextCovariate.class); - - - - private int mismatchesContextSize; - private int indelsContextSize; - - private int mismatchesKeyMask; - private int indelsKeyMask; - - private static final int LENGTH_BITS = 4; - private static final int LENGTH_MASK = 15; - - // the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are - // not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base. - static final private int MAX_DNA_CONTEXT = 13; - private byte LOW_QUAL_TAIL; - - // Initialize any member variables using the command-line arguments passed to the walkers - @Override - public void initialize(final RecalibrationArgumentCollection RAC) { - mismatchesContextSize = RAC.MISMATCHES_CONTEXT_SIZE; - indelsContextSize = RAC.INDELS_CONTEXT_SIZE; - - logger.info("\t\tContext sizes: base substitution model " + mismatchesContextSize + ", indel substitution model " + indelsContextSize); - - if (mismatchesContextSize > MAX_DNA_CONTEXT) - throw new UserException.BadArgumentValue("mismatches_context_size", String.format("context size cannot be bigger than %d, but was %d", MAX_DNA_CONTEXT, mismatchesContextSize)); - if (indelsContextSize > MAX_DNA_CONTEXT) - throw new UserException.BadArgumentValue("indels_context_size", String.format("context size cannot be bigger than %d, but was %d", MAX_DNA_CONTEXT, indelsContextSize)); - - LOW_QUAL_TAIL = RAC.LOW_QUAL_TAIL; - - if (mismatchesContextSize <= 0 || indelsContextSize <= 0) - throw new UserException(String.format("Context size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Indels: %d", mismatchesContextSize, indelsContextSize)); - - mismatchesKeyMask = createMask(mismatchesContextSize); - indelsKeyMask = createMask(indelsContextSize); - } - - @Override - public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { - - // store the original bases and then write Ns over low quality ones - final byte[] originalBases = read.getReadBases().clone(); - // Write N's over the low quality tail of the reads to avoid adding them into the context - final GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); - - final boolean negativeStrand = clippedRead.getReadNegativeStrandFlag(); - byte[] bases = clippedRead.getReadBases(); - if (negativeStrand) - bases = BaseUtils.simpleReverseComplement(bases); - - final ArrayList mismatchKeys = contextWith(bases, mismatchesContextSize, mismatchesKeyMask); - final ArrayList indelKeys = contextWith(bases, indelsContextSize, indelsKeyMask); - - final int readLength = bases.length; - - // this is necessary to ensure that we don't keep historical data in the ReadCovariates values - // since the context covariate may not span the entire set of values in read covariates - // due to the clipping of the low quality bases - if ( readLength != originalBases.length ) { - // don't both zeroing out if we are going to overwrite the whole array - for ( int i = 0; i < originalBases.length; i++ ) - // this base has been clipped off, so zero out the covariate values here - values.addCovariate(0, 0, 0, i); - } - - for (int i = 0; i < readLength; i++) { - final int readOffset = (negativeStrand ? readLength - i - 1 : i); - final int indelKey = indelKeys.get(i); - values.addCovariate(mismatchKeys.get(i), indelKey, indelKey, readOffset); - } - - // put the original bases back in - read.setReadBases(originalBases); - } - - // Used to get the covariate's value from input csv file during on-the-fly recalibration - @Override - public final Object getValue(final String str) { - return str; - } - - @Override - public String formatKey(final int key) { - if (key == -1) // this can only happen in test routines because we do not propagate null keys to the csv file - return null; - - return contextFromKey(key); - } - - @Override - public int keyFromValue(final Object value) { - return keyFromContext((String) value); - } - - private static int createMask(final int contextSize) { - int mask = 0; - // create 2*contextSize worth of bits - for (int i = 0; i < contextSize; i++) - mask = (mask << 2) | 3; - // shift 4 bits to mask out the bits used to encode the length - return mask << LENGTH_BITS; - } - - /** - * calculates the context of a base independent of the covariate mode (mismatch, insertion or deletion) - * - * @param bases the bases in the read to build the context from - * @param contextSize context size to use building the context - * @param mask mask for pulling out just the context bits - */ - private static ArrayList contextWith(final byte[] bases, final int contextSize, final int mask) { - - final int readLength = bases.length; - final ArrayList keys = new ArrayList(readLength); - - // the first contextSize-1 bases will not have enough previous context - for (int i = 1; i < contextSize && i <= readLength; i++) - keys.add(-1); - - if (readLength < contextSize) - return keys; - - final int newBaseOffset = 2 * (contextSize - 1) + LENGTH_BITS; - - // get (and add) the key for the context starting at the first base - int currentKey = keyFromContext(bases, 0, contextSize); - keys.add(currentKey); - - // if the first key was -1 then there was an N in the context; figure out how many more consecutive contexts it affects - int currentNPenalty = 0; - if (currentKey == -1) { - currentKey = 0; - currentNPenalty = contextSize - 1; - int offset = newBaseOffset; - while (bases[currentNPenalty] != 'N') { - final int baseIndex = BaseUtils.simpleBaseToBaseIndex(bases[currentNPenalty]); - currentKey |= (baseIndex << offset); - offset -= 2; - currentNPenalty--; - } - } - - for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) { - final int baseIndex = BaseUtils.simpleBaseToBaseIndex(bases[currentIndex]); - if (baseIndex == -1) { // ignore non-ACGT bases - currentNPenalty = contextSize; - currentKey = 0; // reset the key - } else { - // push this base's contribution onto the key: shift everything 2 bits, mask out the non-context bits, and add the new base and the length in - currentKey = (currentKey >> 2) & mask; - currentKey |= (baseIndex << newBaseOffset); - currentKey |= contextSize; - } - - if (currentNPenalty == 0) { - keys.add(currentKey); - } else { - currentNPenalty--; - keys.add(-1); - } - } - - return keys; - } - - public static int keyFromContext(final String dna) { - return keyFromContext(dna.getBytes(), 0, dna.length()); - } - - /** - * Creates a int representation of a given dna string. - * - * @param dna the dna sequence - * @param start the start position in the byte array (inclusive) - * @param end the end position in the array (exclusive) - * @return the key representing the dna sequence - */ - private static int keyFromContext(final byte[] dna, final int start, final int end) { - - int key = end - start; - int bitOffset = LENGTH_BITS; - for (int i = start; i < end; i++) { - final int baseIndex = BaseUtils.simpleBaseToBaseIndex(dna[i]); - if (baseIndex == -1) // ignore non-ACGT bases - return -1; - key |= (baseIndex << bitOffset); - bitOffset += 2; - } - return key; - } - - /** - * Converts a key into the dna string representation. - * - * @param key the key representing the dna sequence - * @return the dna sequence represented by the key - */ - public static String contextFromKey(final int key) { - if (key < 0) - throw new ReviewedGATKException("dna conversion cannot handle negative numbers. Possible overflow?"); - - final int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context - int mask = 48; // use the mask to pull out bases - int offset = LENGTH_BITS; - - StringBuilder dna = new StringBuilder(); - for (int i = 0; i < length; i++) { - final int baseIndex = (key & mask) >> offset; - dna.append((char)BaseUtils.baseIndexToSimpleBase(baseIndex)); - mask = mask << 2; // move the mask over to the next 2 bits - offset += 2; - } - - return dna.toString(); - } - - @Override - public int maximumKeyValue() { - // the maximum value is T (11 in binary) for each base in the context - int length = Math.max(mismatchesContextSize, indelsContextSize); // the length of the context - int key = length; - int bitOffset = LENGTH_BITS; - for (int i = 0; i DISCRETE_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.ILLUMINA, NGSPlatform.SOLID, NGSPlatform.PACBIO, NGSPlatform.COMPLETE_GENOMICS); - private static final EnumSet FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT); - - // Initialize any member variables using the command-line arguments passed to the walkers - @Override - public void initialize(final RecalibrationArgumentCollection RAC) { - this.MAXIMUM_CYCLE_VALUE = RAC.MAXIMUM_CYCLE_VALUE; - - if (RAC.DEFAULT_PLATFORM != null && !NGSPlatform.isKnown(RAC.DEFAULT_PLATFORM)) - throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM + ") is not a recognized platform."); - - if (RAC.DEFAULT_PLATFORM != null) - default_platform = RAC.DEFAULT_PLATFORM; - } - - // Used to pick out the covariate's value from attributes of the read - @Override - public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { - final int readLength = read.getReadLength(); - final NGSPlatform ngsPlatform = default_platform == null ? read.getNGSPlatform() : NGSPlatform.fromReadGroupPL(default_platform); - - // Discrete cycle platforms - if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) { - final int readOrderFactor = read.getReadPairedFlag() && read.getSecondOfPairFlag() ? -1 : 1; - final int increment; - int cycle; - if (read.getReadNegativeStrandFlag()) { - cycle = readLength * readOrderFactor; - increment = -1 * readOrderFactor; - } - else { - cycle = readOrderFactor; - increment = readOrderFactor; - } - - final int MAX_CYCLE_FOR_INDELS = readLength - CUSHION_FOR_INDELS - 1; - for (int i = 0; i < readLength; i++) { - final int substitutionKey = keyFromCycle(cycle); - final int indelKey = (i < CUSHION_FOR_INDELS || i > MAX_CYCLE_FOR_INDELS) ? -1 : substitutionKey; - values.addCovariate(substitutionKey, indelKey, indelKey, i); - cycle += increment; - } - } - - // Flow cycle platforms - else if (FLOW_CYCLE_PLATFORMS.contains(ngsPlatform)) { - - final byte[] bases = read.getReadBases(); - - // Differentiate between first and second of pair. - // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group - // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. - // Therefore the cycle covariate must differentiate between first and second of pair reads. - // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because - // the current sequential model would consider the effects independently instead of jointly. - final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag(); - - int cycle = multiplyByNegative1 ? -1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms. - - // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change - // For example, AAAAAAA was probably read in two flow cycles but here we count it as one - if (!read.getReadNegativeStrandFlag()) { // Forward direction - int iii = 0; - while (iii < readLength) { - while (iii < readLength && bases[iii] == (byte) 'T') { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii++; - } - while (iii < readLength && bases[iii] == (byte) 'A') { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii++; - } - while (iii < readLength && bases[iii] == (byte) 'C') { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii++; - } - while (iii < readLength && bases[iii] == (byte) 'G') { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii++; - } - if (iii < readLength) { - if (multiplyByNegative1) - cycle--; - else - cycle++; - } - if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii++; - } - - } - } - else { // Negative direction - int iii = readLength - 1; - while (iii >= 0) { - while (iii >= 0 && bases[iii] == (byte) 'T') { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii--; - } - while (iii >= 0 && bases[iii] == (byte) 'A') { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii--; - } - while (iii >= 0 && bases[iii] == (byte) 'C') { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii--; - } - while (iii >= 0 && bases[iii] == (byte) 'G') { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii--; - } - if (iii >= 0) { - if (multiplyByNegative1) - cycle--; - else - cycle++; - } - if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii--; - } - } - } - } - - // Unknown platforms - else { - throw new UserException("The platform (" + read.getReadGroup().getPlatform() - + ") associated with read group " + read.getReadGroup() - + " is not a recognized platform. Allowable options are " + NGSPlatform.knownPlatformsString()); - } - } - - // Used to get the covariate's value from input csv file during on-the-fly recalibration - @Override - public final Object getValue(final String str) { - return Integer.parseInt(str); - } - - @Override - public String formatKey(final int key) { - int cycle = key >> 1; // shift so we can remove the "sign" bit - if ( (key & 1) != 0 ) // is the last bit set? - cycle *= -1; // then the cycle is negative - return String.format("%d", cycle); - } - - @Override - public int keyFromValue(final Object value) { - return (value instanceof String) ? keyFromCycle(Integer.parseInt((String) value)) : keyFromCycle((Integer) value); - } - - @Override - public int maximumKeyValue() { - return (MAXIMUM_CYCLE_VALUE << 1) + 1; - } - - private int keyFromCycle(final int cycle) { - // no negative values because values must fit into the first few bits of the long - int result = Math.abs(cycle); - if ( result > MAXIMUM_CYCLE_VALUE ) - throw new UserException("The maximum allowed value for the cycle is " + MAXIMUM_CYCLE_VALUE + ", but a larger cycle (" + result + ") was detected. Please use the --maximum_cycle_value argument to increase this value (at the expense of requiring more memory to run)"); - - result = result << 1; // shift so we can add the "sign" bit - if ( cycle < 0 ) - result++; // negative cycles get the lower-most bit set - return result; - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ExperimentalCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ExperimentalCovariate.java deleted file mode 100644 index 771c49771..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ExperimentalCovariate.java +++ /dev/null @@ -1,81 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -/** - * [Short one sentence description of this walker] - *

    - *

    - * [Functionality of this walker] - *

    - *

    - *

    Input

    - *

    - * [Input description] - *

    - *

    - *

    Output

    - *

    - * [Output description] - *

    - *

    - *

    Examples

    - *
    - *    java
    - *      -jar GenomeAnalysisTK.jar
    - *      -T $WalkerName
    - *  
    - * - * @author Your Name - * @since Date created - */ -public interface ExperimentalCovariate extends Covariate {} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/QualityScoreCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/QualityScoreCovariate.java deleted file mode 100644 index e31588468..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/QualityScoreCovariate.java +++ /dev/null @@ -1,129 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -import org.broadinstitute.gatk.utils.recalibration.ReadCovariates; -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 3, 2009 - * - * The Reported Quality Score covariate. - */ - -public class QualityScoreCovariate implements RequiredCovariate { - - // Initialize any member variables using the command-line arguments passed to the walkers - @Override - public void initialize(final RecalibrationArgumentCollection RAC) {} - - @Override - public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { - final byte[] baseQualities = read.getBaseQualities(); - final byte[] baseInsertionQualities = read.getBaseInsertionQualities(); - final byte[] baseDeletionQualities = read.getBaseDeletionQualities(); - - for (int i = 0; i < baseQualities.length; i++) { - values.addCovariate((int)baseQualities[i], (int)baseInsertionQualities[i], (int)baseDeletionQualities[i], i); - } - } - - // Used to get the covariate's value from input csv file during on-the-fly recalibration - @Override - public final Object getValue(final String str) { - return Byte.parseByte(str); - } - - @Override - public String formatKey(final int key) { - return String.format("%d", key); - } - - @Override - public int keyFromValue(final Object value) { - return (value instanceof String) ? (int)Byte.parseByte((String) value) : (int)(Byte) value; - } - - @Override - public int maximumKeyValue() { - return QualityUtils.MAX_SAM_QUAL_SCORE; - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ReadGroupCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ReadGroupCovariate.java deleted file mode 100644 index 9eadcf458..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ReadGroupCovariate.java +++ /dev/null @@ -1,190 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.recalibration.ReadCovariates; -import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.HashMap; -import java.util.Map; -import java.util.Set; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Oct 30, 2009 - * - * The Read Group covariate. - */ - -public class ReadGroupCovariate implements RequiredCovariate { - - private final HashMap readGroupLookupTable = new HashMap(); - private final HashMap readGroupReverseLookupTable = new HashMap(); - private int nextId = 0; - private String forceReadGroup; - - // Initialize any member variables using the command-line arguments passed to the walkers - @Override - public void initialize(final RecalibrationArgumentCollection RAC) { - forceReadGroup = RAC.FORCE_READGROUP; - } - - @Override - public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { - final String readGroupId = readGroupValueFromRG(read.getReadGroup()); - final int key = keyForReadGroup(readGroupId); - - final int l = read.getReadLength(); - for (int i = 0; i < l; i++) - values.addCovariate(key, key, key, i); - } - - @Override - public final Object getValue(final String str) { - return str; - } - - @Override - public synchronized String formatKey(final int key) { - // This method is synchronized so that we don't attempt to do a get() - // from the reverse lookup table while that table is being updated - return readGroupReverseLookupTable.get(key); - } - - @Override - public int keyFromValue(final Object value) { - return keyForReadGroup((String) value); - } - - /** - * Get the mapping from read group names to integer key values for all read groups in this covariate - * @return a set of mappings from read group names -> integer key values - */ - public Set> getKeyMap() { - return readGroupLookupTable.entrySet(); - } - - private int keyForReadGroup(final String readGroupId) { - // Rather than synchronize this entire method (which would be VERY expensive for walkers like the BQSR), - // synchronize only the table updates. - - // Before entering the synchronized block, check to see if this read group is not in our tables. - // If it's not, either we will have to insert it, OR another thread will insert it first. - // This preliminary check avoids doing any synchronization most of the time. - if ( ! readGroupLookupTable.containsKey(readGroupId) ) { - - synchronized ( this ) { - - // Now we need to make sure the key is STILL not there, since another thread may have come along - // and inserted it while we were waiting to enter this synchronized block! - if ( ! readGroupLookupTable.containsKey(readGroupId) ) { - readGroupLookupTable.put(readGroupId, nextId); - readGroupReverseLookupTable.put(nextId, readGroupId); - nextId++; - } - } - } - - return readGroupLookupTable.get(readGroupId); - } - - @Override - public synchronized int maximumKeyValue() { - // Synchronized so that we don't query table size while the tables are being updated - return readGroupLookupTable.size() - 1; - } - - /** - * If the sample has a PU tag annotation, return that. If not, return the read group id. - * - * @param rg the read group record - * @return platform unit or readgroup id - */ - private String readGroupValueFromRG(final GATKSAMReadGroupRecord rg) { - if ( forceReadGroup != null ) - return forceReadGroup; - - final String platformUnit = rg.getPlatformUnit(); - return platformUnit == null ? rg.getId() : platformUnit; - } - -} - - diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatCovariate.java deleted file mode 100644 index 1cb4be39c..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatCovariate.java +++ /dev/null @@ -1,285 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.recalibration.ReadCovariates; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.collections.Pair; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; - -public abstract class RepeatCovariate implements ExperimentalCovariate { - protected int MAX_REPEAT_LENGTH; - protected int MAX_STR_UNIT_LENGTH; - private final HashMap repeatLookupTable = new HashMap(); - private final HashMap repeatReverseLookupTable = new HashMap(); - private int nextId = 0; - - // Initialize any member variables using the command-line arguments passed to the walkers - @Override - public void initialize(final RecalibrationArgumentCollection RAC) { - MAX_STR_UNIT_LENGTH = RAC.MAX_STR_UNIT_LENGTH; - MAX_REPEAT_LENGTH = RAC.MAX_REPEAT_LENGTH; - } - - public void initialize(final int MAX_STR_UNIT_LENGTH, final int MAX_REPEAT_LENGTH) { - this.MAX_STR_UNIT_LENGTH = MAX_STR_UNIT_LENGTH; - this.MAX_REPEAT_LENGTH = MAX_REPEAT_LENGTH; - } - - @Override - public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { - // store the original bases and then write Ns over low quality ones - final byte[] originalBases = read.getReadBases().clone(); - - final boolean negativeStrand = read.getReadNegativeStrandFlag(); - byte[] bases = read.getReadBases(); - if (negativeStrand) - bases = BaseUtils.simpleReverseComplement(bases); - - // don't record reads with N's - if (!BaseUtils.isAllRegularBases(bases)) - return; - - for (int i = 0; i < bases.length; i++) { - final Pair res = findTandemRepeatUnits(bases, i); - // to merge repeat unit and repeat length to get covariate value: - final String repeatID = getCovariateValueFromUnitAndLength(res.first, res.second); - final int key = keyForRepeat(repeatID); - - final int readOffset = (negativeStrand ? bases.length - i - 1 : i); - values.addCovariate(key, key, key, readOffset); - } - - // put the original bases back in - read.setReadBases(originalBases); - - } - - public Pair findTandemRepeatUnits(byte[] readBases, int offset) { - int maxBW = 0; - byte[] bestBWRepeatUnit = new byte[]{readBases[offset]}; - for (int str = 1; str <= MAX_STR_UNIT_LENGTH; str++) { - // fix repeat unit length - //edge case: if candidate tandem repeat unit falls beyond edge of read, skip - if (offset+1-str < 0) - break; - - // get backward repeat unit and # repeats - byte[] backwardRepeatUnit = Arrays.copyOfRange(readBases, offset - str + 1, offset + 1); - maxBW = GATKVariantContextUtils.findNumberOfRepetitions(backwardRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); - if (maxBW > 1) { - bestBWRepeatUnit = backwardRepeatUnit.clone(); - break; - } - } - byte[] bestRepeatUnit = bestBWRepeatUnit; - int maxRL = maxBW; - - if (offset < readBases.length-1) { - byte[] bestFWRepeatUnit = new byte[]{readBases[offset+1]}; - int maxFW = 0; - for (int str = 1; str <= MAX_STR_UNIT_LENGTH; str++) { - // fix repeat unit length - //edge case: if candidate tandem repeat unit falls beyond edge of read, skip - if (offset+str+1 > readBases.length) - break; - - // get forward repeat unit and # repeats - byte[] forwardRepeatUnit = Arrays.copyOfRange(readBases, offset +1, offset+str+1); - maxFW = GATKVariantContextUtils.findNumberOfRepetitions(forwardRepeatUnit, Arrays.copyOfRange(readBases, offset + 1, readBases.length), true); - if (maxFW > 1) { - bestFWRepeatUnit = forwardRepeatUnit.clone(); - break; - } - } - // if FW repeat unit = BW repeat unit it means we're in the middle of a tandem repeat - add FW and BW components - if (Arrays.equals(bestFWRepeatUnit, bestBWRepeatUnit)) { - maxRL = maxBW + maxFW; - bestRepeatUnit = bestFWRepeatUnit; // arbitrary - } - else { - // tandem repeat starting forward from current offset. - // It could be the case that best BW unit was differnet from FW unit, but that BW still contains FW unit. - // For example, TTCTT(C) CCC - at (C) place, best BW unit is (TTC)2, best FW unit is (C)3. - // but correct representation at that place might be (C)4. - // Hence, if the FW and BW units don't match, check if BW unit can still be a part of FW unit and add - // representations to total - maxBW = GATKVariantContextUtils.findNumberOfRepetitions(bestFWRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); - maxRL = maxFW + maxBW; - bestRepeatUnit = bestFWRepeatUnit; - - } - - } - - - - if(maxRL > MAX_REPEAT_LENGTH) { maxRL = MAX_REPEAT_LENGTH; } - return new Pair(bestRepeatUnit, maxRL); - - } - @Override - public final Object getValue(final String str) { - return str; - } - - @Override - public synchronized String formatKey(final int key) { - // This method is synchronized so that we don't attempt to do a get() - // from the reverse lookup table while that table is being updated - return repeatReverseLookupTable.get(key); - } - - @Requires({"repeatLength>=0", "repeatFromUnitAndLength != null"}) - @Ensures("result != null") - protected abstract String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength); - - - @Override - public int keyFromValue(final Object value) { - return keyForRepeat((String) value); - } - - /** - * Get the mapping from read group names to integer key values for all read groups in this covariate - * @return a set of mappings from read group names -> integer key values - */ - public Set> getKeyMap() { - return repeatLookupTable.entrySet(); - } - - private int keyForRepeat(final String repeatID) { - // Rather than synchronize this entire method (which would be VERY expensive for walkers like the BQSR), - // synchronize only the table updates. - - // Before entering the synchronized block, check to see if this read group is not in our tables. - // If it's not, either we will have to insert it, OR another thread will insert it first. - // This preliminary check avoids doing any synchronization most of the time. - if ( ! repeatLookupTable.containsKey(repeatID) ) { - - synchronized ( this ) { - - // Now we need to make sure the key is STILL not there, since another thread may have come along - // and inserted it while we were waiting to enter this synchronized block! - if ( ! repeatLookupTable.containsKey(repeatID) ) { - repeatLookupTable.put(repeatID, nextId); - repeatReverseLookupTable.put(nextId, repeatID); - nextId++; - } - } - } - - return repeatLookupTable.get(repeatID); - } - - - /** - * Splits repeat unit and num repetitions from covariate value. - * For example, if value if "ATG4" it returns (ATG,4) - * @param value Covariate value - * @return Split pair - */ - @Requires("value != null") - @Ensures({"result.first != null","result.second>=0"}) - public static Pair getRUandNRfromCovariate(final String value) { - - int k = 0; - for ( k=0; k < value.length(); k++ ) { - if (!BaseUtils.isRegularBase(value.getBytes()[k])) - break; - } - Integer nr = Integer.valueOf(value.substring(k,value.length())); // will throw NumberFormatException if format illegal - if (k == value.length() || nr <= 0) - throw new IllegalStateException("Covariate is not of form (Repeat Unit) + Integer"); - - return new Pair(value.substring(0,k), nr); - } - - /** - * Gets bases from tandem repeat representation (Repeat Unit),(Number of Repeats). - * For example, (AGC),3 returns AGCAGCAGC - * @param repeatUnit Tandem repeat unit - * @param numRepeats Number of repeats - * @return Expanded String - */ - @Requires({"numRepeats > 0","repeatUnit != null"}) - @Ensures("result != null") - public static String getBasesFromRUandNR(final String repeatUnit, final int numRepeats) { - final StringBuilder sb = new StringBuilder(); - - for (int i=0; i < numRepeats; i++) - sb.append(repeatUnit); - - return sb.toString(); - } - - // version given covariate key - public static String getBasesFromRUandNR(final String covariateValue) { - Pair pair = getRUandNRfromCovariate(covariateValue); - return getBasesFromRUandNR(pair.getFirst(), pair.getSecond()); - } - - @Override - public abstract int maximumKeyValue(); - - - -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatLengthCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatLengthCovariate.java deleted file mode 100644 index 398633062..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatLengthCovariate.java +++ /dev/null @@ -1,74 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; - -public class RepeatLengthCovariate extends RepeatCovariate { - - @Requires({"repeatLength>=0", "repeatFromUnitAndLength != null"}) - @Ensures("result != null") - protected String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength) { - return String.format("%d",repeatLength); - } - - @Override - public synchronized int maximumKeyValue() { - // Synchronized so that we don't query table size while the tables are being updated - //return repeatLookupTable.size() - 1; - // max possible values of covariate: for repeat unit, length is up to MAX_STR_UNIT_LENGTH, - // so we have 4^MAX_STR_UNIT_LENGTH * MAX_REPEAT_LENGTH possible values - return (1+MAX_REPEAT_LENGTH); - } - -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java deleted file mode 100644 index 345ef0d7d..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java +++ /dev/null @@ -1,75 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; - - -public class RepeatUnitAndLengthCovariate extends RepeatCovariate { - - @Requires({"repeatLength>=0", "repeatFromUnitAndLength != null"}) - @Ensures("result != null") - protected String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength) { - return new String(repeatFromUnitAndLength) + String.format("%d",repeatLength); - } - - @Override - public synchronized int maximumKeyValue() { - // Synchronized so that we don't query table size while the tables are being updated - //return repeatLookupTable.size() - 1; - // max possible values of covariate: for repeat unit, length is up to MAX_STR_UNIT_LENGTH, - // so we have 4^MAX_STR_UNIT_LENGTH * MAX_REPEAT_LENGTH possible values - return (1<<(2*MAX_STR_UNIT_LENGTH)) * MAX_REPEAT_LENGTH +1; - } - -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatUnitCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatUnitCovariate.java deleted file mode 100644 index b1b0ca457..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatUnitCovariate.java +++ /dev/null @@ -1,78 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -/** - * Created with IntelliJ IDEA. - * User: rpoplin - * Date: 11/3/12 - */ - -public class RepeatUnitCovariate extends RepeatCovariate { - - protected String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength) { - return new String(repeatFromUnitAndLength); - - } - - - @Override - public synchronized int maximumKeyValue() { - // Synchronized so that we don't query table size while the tables are being updated - //return repeatLookupTable.size() - 1; - // max possible values of covariate: for repeat unit, length is up to MAX_STR_UNIT_LENGTH, - // so we have 4^MAX_STR_UNIT_LENGTH * MAX_REPEAT_LENGTH possible values - return (1<<(2*MAX_STR_UNIT_LENGTH)) +1; - } - - -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RequiredCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RequiredCovariate.java deleted file mode 100644 index e30df7dd2..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RequiredCovariate.java +++ /dev/null @@ -1,81 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -/** - * [Short one sentence description of this walker] - *

    - *

    - * [Functionality of this walker] - *

    - *

    - *

    Input

    - *

    - * [Input description] - *

    - *

    - *

    Output

    - *

    - * [Output description] - *

    - *

    - *

    Examples

    - *
    - *    java
    - *      -jar GenomeAnalysisTK.jar
    - *      -T $WalkerName
    - *  
    - * - * @author Your Name - * @since Date created - */ -public interface RequiredCovariate extends Covariate {} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/StandardCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/StandardCovariate.java deleted file mode 100644 index 4e40f7d49..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/StandardCovariate.java +++ /dev/null @@ -1,81 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -/** - * [Short one sentence description of this walker] - *

    - *

    - * [Functionality of this walker] - *

    - *

    - *

    Input

    - *

    - * [Input description] - *

    - *

    - *

    Output

    - *

    - * [Output description] - *

    - *

    - *

    Examples

    - *
    - *    java
    - *      -jar GenomeAnalysisTK.jar
    - *      -T $WalkerName
    - *  
    - * - * @author Your Name - * @since Date created - */ -public interface StandardCovariate extends Covariate {} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/sam/ClippedGATKSAMRecord.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/sam/ClippedGATKSAMRecord.java index 3baa98018..5f6ee422d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/sam/ClippedGATKSAMRecord.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/sam/ClippedGATKSAMRecord.java @@ -51,6 +51,8 @@ package org.broadinstitute.gatk.utils.sam; +import htsjdk.samtools.GATKBin; + import java.util.Arrays; /** @@ -69,15 +71,30 @@ public class ClippedGATKSAMRecord extends GATKSAMRecord { * @param end inclusive last position in {@code read} included in the clipped view. */ public ClippedGATKSAMRecord(final GATKSAMRecord read, int start, int end) { - super(read.getHeader(), read.getReferenceIndex(), read.getAlignmentStart() + start, (short) read.getReadNameLength(), - (short) 100, -1, read.getCigarLength(), read.getFlags(), end - start, - read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getInferredInsertSize(), - new byte[0]); + super(read.getHeader()); + this.setReferenceIndex(read.getReferenceIndex()); + this.setAlignmentStart(read.getAlignmentStart() + start); + this.setMappingQuality(100); + // setting read indexing bin below + this.setFlags(read.getFlags()); + this.setMateReferenceIndex(read.getMateReferenceIndex()); + this.setMateAlignmentStart(read.getMateAlignmentStart()); + this.setInferredInsertSize(read.getInferredInsertSize()); this.setReadBases(Arrays.copyOfRange(read.getReadBases(), start, end)); this.setBaseQualities(Arrays.copyOfRange(read.getBaseQualities(),start,end)); this.setReadName(read.getReadName()); insertionQuals = Arrays.copyOfRange(read.getBaseInsertionQualities(),start,end); deletionQuals = Arrays.copyOfRange(read.getBaseDeletionQualities(),start,end); + + // Set these to null in order to mark them as being candidates for lazy initialization. + // If this is not done, they will have non-null defaults. + super.setReadName(null); + super.setCigarString(null); + super.setReadBases(null); + super.setBaseQualities(null); + + // Do this after the above because setCigarString will clear it. + GATKBin.setReadIndexingBin(this, -1); } @Override diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/variant/ReferenceConfidenceVariantContextMerger.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/variant/ReferenceConfidenceVariantContextMerger.java deleted file mode 100644 index 6676650c8..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/variant/ReferenceConfidenceVariantContextMerger.java +++ /dev/null @@ -1,417 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.variant; - -import htsjdk.variant.variantcontext.*; -import htsjdk.variant.vcf.VCFConstants; -import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculators; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.util.*; - -/** - * Variant context utilities related to merging variant-context instances. - * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class ReferenceConfidenceVariantContextMerger { - - private static Comparable combineAnnotationValues( final List array ) { - return MathUtils.median(array); // right now we take the median but other options could be explored - } - - /** - * Merges VariantContexts from gVCFs into a single hybrid. - * Assumes that none of the input records are filtered. - * - * @param VCs collection of unsorted genomic VCs - * @param loc the current location - * @param refBase the reference allele to use if all contexts in the VC are spanning (i.e. don't start at the location in loc); if null, we'll return null in this case - * @param removeNonRefSymbolicAllele if true, remove the allele from the merged VC - * @return new VariantContext representing the merge of all VCs or null if it not relevant - */ - public static VariantContext merge(final List VCs, final GenomeLoc loc, final Byte refBase, final boolean removeNonRefSymbolicAllele) { - // this can happen if e.g. you are using a dbSNP file that spans a region with no gVCFs - if ( VCs == null || VCs.size() == 0 ) - return null; - - // establish the baseline info (sometimes from the first VC) - final VariantContext first = VCs.get(0); - final String name = first.getSource(); - - // ref allele - final Allele refAllele = determineReferenceAlleleGivenReferenceBase(VCs, loc, refBase); - if ( refAllele == null ) - return null; - - // FinalAlleleSet contains the alleles of the new resulting VC - // Using linked set in order to guarantee a stable order - final LinkedHashSet finalAlleleSet = new LinkedHashSet<>(10); - // Reference goes first - finalAlleleSet.add(refAllele); - - final Map attributes = new LinkedHashMap<>(); - final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id - int depth = 0; - final Map> annotationMap = new LinkedHashMap<>(); - final GenotypesContext genotypes = GenotypesContext.create(); - - final int variantContextCount = VCs.size(); - // In this list we hold the mapping of each variant context alleles. - final List>> vcAndNewAllelePairs = new ArrayList<>(variantContextCount); - // cycle through and add info from the other VCs - for ( final VariantContext vc : VCs ) { - - // if this context doesn't start at the current location then it must be a spanning event (deletion or ref block) - final boolean isSpanningEvent = loc.getStart() != vc.getStart(); - - vcAndNewAllelePairs.add(new Pair<>(vc,isSpanningEvent ? replaceWithNoCalls(vc.getAlleles()) - : remapAlleles(vc.getAlleles(), refAllele, finalAlleleSet))); - } - - // Add to the end if at all required in in the output. - if (!removeNonRefSymbolicAllele) finalAlleleSet.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - - final List allelesList = new ArrayList<>(finalAlleleSet); - - for ( final Pair> pair : vcAndNewAllelePairs ) { - final VariantContext vc = pair.getFirst(); - final List remappedAlleles = pair.getSecond(); - - mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList); - - // special case DP (add it up) for all events - if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) { - depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); - } else { // handle the gVCF case from the HaplotypeCaller - for( final Genotype gt : vc.getGenotypes() ) { - depth += (gt.hasExtendedAttribute("MIN_DP") ? Integer.parseInt((String)gt.getAnyAttribute("MIN_DP")) : (gt.hasDP() ? gt.getDP() : 0)); - } - } - - if ( loc.getStart() != vc.getStart() ) - continue; - - // special case ID (just preserve it) - if ( vc.hasID() ) rsIDs.add(vc.getID()); - - // add attributes - addReferenceConfidenceAttributes(vc.getAttributes(), annotationMap); - } - - // when combining annotations use the median value from all input VCs which had annotations provided - for ( final Map.Entry> p : annotationMap.entrySet() ) { - if ( ! p.getValue().isEmpty() ) { - attributes.put(p.getKey(), combineAnnotationValues(p.getValue())); - } - } - - if ( depth > 0 ) - attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); - - // remove stale AC and AF based attributes - removeStaleAttributesAfterMerge(attributes); - - final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); - - final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(allelesList) - .chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(allelesList, loc.getStart(), loc.getStart()) - .genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to re-genotype later - - return builder.make(); - } - - /** - * Determines the ref allele given the provided reference base at this position - * - * @param VCs collection of unsorted genomic VCs - * @param loc the current location - * @param refBase the reference allele to use if all contexts in the VC are spanning - * @return new Allele or null if no reference allele/base is available - */ - private static Allele determineReferenceAlleleGivenReferenceBase(final List VCs, final GenomeLoc loc, final Byte refBase) { - final Allele refAllele = GATKVariantContextUtils.determineReferenceAllele(VCs, loc); - if ( refAllele == null ) - return ( refBase == null ? null : Allele.create(refBase, true) ); - return refAllele; - } - - /** - * Remove the stale attributes from the merged set - * - * @param attributes the attribute map - */ - private static void removeStaleAttributesAfterMerge(final Map attributes) { - attributes.remove(VCFConstants.ALLELE_COUNT_KEY); - attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY); - attributes.remove(VCFConstants.ALLELE_NUMBER_KEY); - attributes.remove(VCFConstants.MLE_ALLELE_COUNT_KEY); - attributes.remove(VCFConstants.MLE_ALLELE_FREQUENCY_KEY); - attributes.remove(VCFConstants.END_KEY); - } - - /** - * Adds attributes to the global map from the new context in a sophisticated manner - * - * @param myAttributes attributes to add from - * @param annotationMap map of annotations for combining later - */ - private static void addReferenceConfidenceAttributes(final Map myAttributes, - final Map> annotationMap) { - for ( final Map.Entry p : myAttributes.entrySet() ) { - final String key = p.getKey(); - final Object value = p.getValue(); - - // add the annotation values to a list for combining later - List values = annotationMap.get(key); - if( values == null ) { - values = new ArrayList<>(); - annotationMap.put(key, values); - } - try { - final String stringValue = value.toString(); - // Branch to avoid unintentional, implicit type conversions that occur with the ? operator. - if (stringValue.contains(".")) - values.add(Double.parseDouble(stringValue)); - else - values.add(Integer.parseInt(stringValue)); - } catch (final NumberFormatException e) { - // nothing to do - } - } - } - - /** - * This method does a couple of things: - *
    • - * remaps the vc alleles considering the differences between the final reference allele and its own reference,
    • - *
    • - * collects alternative alleles present in variant context and add them to the {@code finalAlleles} set. - *
    - * - * @param vcAlleles the variant context allele list. - * @param refAllele final reference allele. - * @param finalAlleles where to add the final set of non-ref called alleles. - * @return never {@code null} - */ - //TODO as part of a larger refactoring effort {@link #remapAlleles} can be merged with {@link GATKVariantContextUtils#remapAlleles}. - private static List remapAlleles(final List vcAlleles, final Allele refAllele, final LinkedHashSet finalAlleles) { - final Allele vcRef = vcAlleles.get(0); - if (!vcRef.isReference()) throw new IllegalStateException("the first allele of the vc allele list must be reference"); - final byte[] refBases = refAllele.getBases(); - final int extraBaseCount = refBases.length - vcRef.getBases().length; - if (extraBaseCount < 0) throw new IllegalStateException("the wrong reference was selected"); - final List result = new ArrayList<>(vcAlleles.size()); - - for (final Allele a : vcAlleles) { - if (a.isReference()) { - result.add(refAllele); - } else if (a.isSymbolic()) { - result.add(a); - // we always skip when adding to finalAlleles this is done outside if applies. - if (!a.equals(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE)) - finalAlleles.add(a); - } else if (a.isCalled()) { - final Allele newAllele; - if (extraBaseCount > 0) { - final byte[] oldBases = a.getBases(); - final byte[] newBases = Arrays.copyOf(oldBases,oldBases.length + extraBaseCount); - System.arraycopy(refBases,refBases.length - extraBaseCount,newBases,oldBases.length,extraBaseCount); - newAllele = Allele.create(newBases,false); - } else - newAllele = a; - result.add(newAllele); - finalAlleles.add(newAllele); - } else { // NO_CALL and strange miscellanea - result.add(a); - } - } - return result; - } - - /** - * Replaces any alleles in the list with NO CALLS, except for the generic ALT allele - * - * @param alleles list of alleles to replace - * @return non-null list of alleles - */ - private static List replaceWithNoCalls(final List alleles) { - if ( alleles == null ) throw new IllegalArgumentException("list of alleles cannot be null"); - - final List result = new ArrayList<>(alleles.size()); - for ( final Allele allele : alleles ) - result.add(allele.equals(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) ? allele : Allele.NO_CALL); - return result; - } - - /** - * Merge into the context a new genotype represented by the given VariantContext for the provided list of target alleles. - * This method assumes that none of the alleles in the VC overlaps with any of the alleles in the set. - * - * @param mergedGenotypes the genotypes context to add to - * @param VC the Variant Context for the sample - * @param remappedAlleles the list of remapped alleles for the sample - * @param targetAlleles the list of target alleles - */ - private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes, - final VariantContext VC, - final List remappedAlleles, - final List targetAlleles) { - final int maximumPloidy = VC.getMaxPloidy(GATKVariantContextUtils.DEFAULT_PLOIDY); - // the map is different depending on the ploidy, so in order to keep this method flexible (mixed ploidies) - // we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible. - final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][]; - final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size()); - final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart()); - - for ( final Genotype g : VC.getGenotypes() ) { - final String name = g.getSampleName(); - if ( mergedGenotypes.containsSample(name) ) - continue; - final int ploidy = g.getPloidy(); - final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy())); - if (g.hasPL()) { - // lazy initialization of the genotype index map by ploidy. - final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null - ? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(indexesOfRelevantAlleles) - : genotypeIndexMapsByPloidy[ploidy]; - final int[] PLs = generatePL(g, genotypeIndexMapByPloidy); - final int[] AD = g.hasAD() ? generateAD(g.getAD(), indexesOfRelevantAlleles) : null; - genotypeBuilder.PL(PLs).AD(AD).noGQ(); - } - mergedGenotypes.add(genotypeBuilder.make()); - } - } - - /** - * Composes a new likelihood array given the original genotype and the genotype index map. - * - * @param g the original genotype. - * @param genotypeIndexMapByPloidy genotype index map. The ith element indicates what genotype in {@code g} corresponds - * to the ith genotype in the return likelihoods array. - * - * @throws NullPointerException if {@code g} or {@code genotypeIndexMapByPloidy} is {@code null}, or if {@code g} - * does not contain likelihoods. - * @throws IndexOutOfBoundsException if {@code genotypeIndexMapByPloidy} contain non valid - * genotype indices given the likelihood array in {@code g}. - * - * @return never {@code null} but an array of exactly {@code genotypeIndexMapByPloidy.length} positions. - */ - private static int[] generatePL(final Genotype g, final int[] genotypeIndexMapByPloidy) { - final int[] PLs = new int[genotypeIndexMapByPloidy.length]; - final int[] oldPLs = g.getPL(); - for (int i = 0; i < PLs.length; i++) - PLs[i] = oldPLs[genotypeIndexMapByPloidy[i]]; - return PLs; - } - - /** - * Determines the allele mapping from myAlleles to the targetAlleles, substituting the generic "" as appropriate. - * If the myAlleles set does not contain "" as an allele, it throws an exception. - * - * @param remappedAlleles the list of alleles to evaluate - * @param targetAlleles the target list of alleles - * @param position position to use for error messages - * @return non-null array of ints representing indexes - */ - protected static int[] getIndexesOfRelevantAlleles(final List remappedAlleles, final List targetAlleles, final int position) { - - if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty"); - if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty"); - - if ( !remappedAlleles.contains(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) ) - throw new UserException("The list of input alleles must contain " + GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records"); - final int indexOfGenericAlt = remappedAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - - final int[] indexMapping = new int[targetAlleles.size()]; - - // the reference alleles always match up (even if they don't appear to) - indexMapping[0] = 0; - - // create the index mapping, using the allele whenever such a mapping doesn't exist - for ( int i = 1; i < targetAlleles.size(); i++ ) { - final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i)); - indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfGenericAlt : indexOfRemappedAllele; - } - - return indexMapping; - } - - /** - * Generates a new AD array by adding zeros for missing alleles given the set of indexes of the Genotype's current - * alleles from the original AD. - * - * @param originalAD the original AD to extend - * @param indexesOfRelevantAlleles the indexes of the original alleles corresponding to the new alleles - * @return non-null array of new AD values - */ - protected static int[] generateAD(final int[] originalAD, final int[] indexesOfRelevantAlleles) { - if ( originalAD == null || indexesOfRelevantAlleles == null ) throw new IllegalArgumentException("The list of input AD values and alleles must not be null"); - - final int numADs = indexesOfRelevantAlleles.length; - final int[] newAD = new int[numADs]; - - for ( int i = 0; i < numADs; i++ ) { - final int oldIndex = indexesOfRelevantAlleles[i]; - if ( oldIndex >= originalAD.length ) - newAD[i] = 0; - else - newAD[i] = originalAD[oldIndex]; - } - - return newAD; - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ContextCovariateUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ContextCovariateUnitTest.java new file mode 100644 index 000000000..25748f70e --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ContextCovariateUnitTest.java @@ -0,0 +1,121 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.recalibration.covariates.ContextCovariate; +import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate; +import org.broadinstitute.gatk.utils.clipping.ClippingRepresentation; +import org.broadinstitute.gatk.utils.clipping.ReadClipper; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +/** + * @author Mauricio Carneiro + * @since 3/1/12 + */ +public class ContextCovariateUnitTest { + ContextCovariate covariate; + RecalibrationArgumentCollection RAC; + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + covariate = new ContextCovariate(); + covariate.initialize(RAC); + } + + @BeforeMethod + public void initCache() { + ReadCovariates.clearKeysCache(); + } + + @Test(enabled = true) + public void testSimpleContexts() { + GATKSAMRecord read = ReadUtils.createRandomRead(1000); + GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + + verifyCovariateArray(readCovariates.getMismatchesKeySet(), RAC.MISMATCHES_CONTEXT_SIZE, clippedRead, covariate); + verifyCovariateArray(readCovariates.getInsertionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); + verifyCovariateArray(readCovariates.getDeletionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); + } + + public static void verifyCovariateArray(int[][] values, int contextSize, GATKSAMRecord read, Covariate contextCovariate) { + for (int i = 0; i < values.length; i++) + Assert.assertEquals(contextCovariate.formatKey(values[i][0]), expectedContext(read, i, contextSize)); + + } + + public static String expectedContext (GATKSAMRecord read, int offset, int contextSize) { + final String bases = stringFrom(read.getReadBases()); + String expectedContext = null; + if (offset - contextSize + 1 >= 0) { + String context = bases.substring(offset - contextSize + 1, offset + 1); + if (!context.contains("N")) + expectedContext = context; + } + return expectedContext; + } + + private static String stringFrom(byte[] array) { + String s = ""; + for (byte value : array) + s += (char) value; + return s; + } + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/CycleCovariateUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/CycleCovariateUnitTest.java new file mode 100644 index 000000000..f40152e94 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/CycleCovariateUnitTest.java @@ -0,0 +1,140 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.engine.recalibration.covariates.CycleCovariate; +import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +/** + * @author Mauricio Carneiro + * @since 3/1/12 + */ +public class CycleCovariateUnitTest { + CycleCovariate covariate; + RecalibrationArgumentCollection RAC; + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + covariate = new CycleCovariate(); + covariate.initialize(RAC); + } + + @BeforeMethod + public void initCache() { + ReadCovariates.clearKeysCache(); + } + + @Test(enabled = true) + public void testSimpleCycles() { + short readLength = 10; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), 1, (short) 1); + + read.setReadNegativeStrandFlag(true); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), readLength, -1); + + read.setSecondOfPairFlag(true); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), -readLength, 1); + + read.setReadNegativeStrandFlag(false); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), -1, -1); + } + + private void verifyCovariateArray(int[][] values, int init, int increment) { + for (short i = 0; i < values.length; i++) { + short actual = Short.decode(covariate.formatKey(values[i][0])); + int expected = init + (increment * i); + Assert.assertEquals(actual, expected); + } + } + + @Test(enabled = true, expectedExceptions={UserException.class}) + public void testMoreThanMaxCycleFails() { + int readLength = RAC.MAXIMUM_CYCLE_VALUE + 1; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + } + + @Test(enabled = true) + public void testMaxCyclePasses() { + int readLength = RAC.MAXIMUM_CYCLE_VALUE; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/QualQuantizerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/QualQuantizerUnitTest.java new file mode 100644 index 000000000..b8d5c5303 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/QualQuantizerUnitTest.java @@ -0,0 +1,195 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + + +// the imports for unit testing. + + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.Utils; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + + +public class QualQuantizerUnitTest extends BaseTest { + @BeforeSuite + public void before() { + + } + + // -------------------------------------------------------------------------------- + // + // merge case Provider + // + // -------------------------------------------------------------------------------- + + private class QualIntervalTestProvider extends TestDataProvider { + final QualQuantizer.QualInterval left, right; + int exError, exTotal, exQual; + double exErrorRate; + + private QualIntervalTestProvider(int leftE, int leftN, int rightE, int rightN, int exError, int exTotal) { + super(QualIntervalTestProvider.class); + + QualQuantizer qq = new QualQuantizer(0); + left = qq.new QualInterval(10, 10, leftN, leftE, 0); + right = qq.new QualInterval(11, 11, rightN, rightE, 0); + + this.exError = exError; + this.exTotal = exTotal; + this.exErrorRate = (leftE + rightE + 1) / (1.0 * (leftN + rightN + 1)); + this.exQual = QualityUtils.errorProbToQual(this.exErrorRate); + } + } + + @DataProvider(name = "QualIntervalTestProvider") + public Object[][] makeQualIntervalTestProvider() { + new QualIntervalTestProvider(10, 100, 10, 1000, 20, 1100); + new QualIntervalTestProvider(0, 100, 10, 900, 10, 1000); + new QualIntervalTestProvider(10, 900, 0, 100, 10, 1000); + new QualIntervalTestProvider(0, 0, 10, 100, 10, 100); + new QualIntervalTestProvider(1, 10, 9, 90, 10, 100); + new QualIntervalTestProvider(1, 10, 9, 100000, 10, 100010); + new QualIntervalTestProvider(1, 10, 9, 1000000, 10,1000010); + + return QualIntervalTestProvider.getTests(QualIntervalTestProvider.class); + } + + @Test(dataProvider = "QualIntervalTestProvider") + public void testQualInterval(QualIntervalTestProvider cfg) { + QualQuantizer.QualInterval merged = cfg.left.merge(cfg.right); + Assert.assertEquals(merged.nErrors, cfg.exError); + Assert.assertEquals(merged.nObservations, cfg.exTotal); + Assert.assertEquals(merged.getErrorRate(), cfg.exErrorRate); + Assert.assertEquals(merged.getQual(), cfg.exQual); + } + + @Test + public void testMinInterestingQual() { + for ( int q = 0; q < 15; q++ ) { + for ( int minQual = 0; minQual <= 10; minQual ++ ) { + QualQuantizer qq = new QualQuantizer(minQual); + QualQuantizer.QualInterval left = qq.new QualInterval(q, q, 100, 10, 0); + QualQuantizer.QualInterval right = qq.new QualInterval(q+1, q+1, 1000, 100, 0); + + QualQuantizer.QualInterval merged = left.merge(right); + boolean shouldBeFree = q+1 <= minQual; + if ( shouldBeFree ) + Assert.assertEquals(merged.getPenalty(), 0.0); + else + Assert.assertTrue(merged.getPenalty() > 0.0); + } + } + } + + + // -------------------------------------------------------------------------------- + // + // High-level case Provider + // + // -------------------------------------------------------------------------------- + + private class QuantizerTestProvider extends TestDataProvider { + final List nObservationsPerQual = new ArrayList(); + final int nLevels; + final List expectedMap; + + private QuantizerTestProvider(final List nObservationsPerQual, final int nLevels, final List expectedMap) { + super(QuantizerTestProvider.class); + + for ( int x : nObservationsPerQual ) + this.nObservationsPerQual.add((long)x); + this.nLevels = nLevels; + this.expectedMap = expectedMap; + } + + @Override + public String toString() { + return String.format("QQTest nLevels=%d nObs=[%s] map=[%s]", + nLevels, Utils.join(",", nObservationsPerQual), Utils.join(",", expectedMap)); + } + } + + @DataProvider(name = "QuantizerTestProvider") + public Object[][] makeQuantizerTestProvider() { + List allQ2 = Arrays.asList(0, 0, 1000, 0, 0); + + new QuantizerTestProvider(allQ2, 5, Arrays.asList(0, 1, 2, 3, 4)); + new QuantizerTestProvider(allQ2, 1, Arrays.asList(2, 2, 2, 2, 2)); + + new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 0, 1000), 2, Arrays.asList(2, 2, 2, 2, 4)); + new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 1, 1000), 2, Arrays.asList(2, 2, 2, 4, 4)); + new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 10, 1000), 2, Arrays.asList(2, 2, 2, 2, 4)); + + return QuantizerTestProvider.getTests(QuantizerTestProvider.class); + } + + @Test(dataProvider = "QuantizerTestProvider", enabled = true) + public void testQuantizer(QuantizerTestProvider cfg) { + QualQuantizer qq = new QualQuantizer(cfg.nObservationsPerQual, cfg.nLevels, 0); + logger.warn("cfg: " + cfg); + for ( int i = 0; i < cfg.expectedMap.size(); i++) { + int expected = cfg.expectedMap.get(i); + int observed = qq.originalToQuantizedMap.get(i); + //logger.warn(String.format(" qq map: %s : %d => %d", i, expected, observed)); + Assert.assertEquals(observed, expected); + } + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ReadCovariatesUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ReadCovariatesUnitTest.java new file mode 100644 index 000000000..f263345e7 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ReadCovariatesUnitTest.java @@ -0,0 +1,148 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.recalibration.covariates.*; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.recalibration.EventType; +import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.Random; + +/** + * @author carneiro + * @since 4/21/12 + */ +public class ReadCovariatesUnitTest { + + @BeforeMethod + public void init() { + ReadCovariates.clearKeysCache(); + } + + @Test(enabled = false) + public void testCovariateGeneration() { + final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + final String RGID = "id"; + + ReadGroupCovariate rgCov = new ReadGroupCovariate(); + QualityScoreCovariate qsCov = new QualityScoreCovariate(); + ContextCovariate coCov = new ContextCovariate(); + CycleCovariate cyCov = new CycleCovariate(); + + rgCov.initialize(RAC); + qsCov.initialize(RAC); + coCov.initialize(RAC); + cyCov.initialize(RAC); + + Covariate[] requestedCovariates = new Covariate[4]; + requestedCovariates[0] = rgCov; + requestedCovariates[1] = qsCov; + requestedCovariates[2] = coCov; + requestedCovariates[3] = cyCov; + + final int NUM_READS = 100; + final Random rnd = Utils.getRandomGenerator(); + + final String[] readGroups = {"RG1", "RG2", "RGbla"}; + for (int idx = 0; idx < NUM_READS; idx++) { + for (final String rgs : readGroups) { + final int length = 10 + rnd.nextInt(100); // random read length, at least 10 bp long + final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(rgs); + rg.setPlatform("illumina"); + read.setReadGroup(rg); + read.setReadNegativeStrandFlag(rnd.nextBoolean()); + final byte[] mQuals = read.getBaseQualities(EventType.BASE_SUBSTITUTION); + final byte[] iQuals = read.getBaseQualities(EventType.BASE_INSERTION); + final byte[] dQuals = read.getBaseQualities(EventType.BASE_DELETION); + ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); + + // check that the length is correct + Assert.assertEquals(rc.getMismatchesKeySet().length, length); + Assert.assertEquals(rc.getInsertionsKeySet().length, length); + Assert.assertEquals(rc.getDeletionsKeySet().length, length); + + for (int i = 0; i < length; i++) { + // check that read group is always the same + Assert.assertEquals(rgCov.formatKey(rc.getMismatchesKeySet(i)[0]), rgs); + Assert.assertEquals(rgCov.formatKey(rc.getInsertionsKeySet(i)[0]), rgs); + Assert.assertEquals(rgCov.formatKey(rc.getDeletionsKeySet(i)[0]), rgs); + + // check quality score + Assert.assertEquals(qsCov.formatKey(rc.getMismatchesKeySet(i)[1]), "" + mQuals[i]); + Assert.assertEquals(qsCov.formatKey(rc.getInsertionsKeySet(i)[1]), "" + iQuals[i]); + Assert.assertEquals(qsCov.formatKey(rc.getDeletionsKeySet(i)[1]), "" + dQuals[i]); + + // check context + Assert.assertEquals(coCov.formatKey(rc.getMismatchesKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.MISMATCHES_CONTEXT_SIZE)); + Assert.assertEquals(coCov.formatKey(rc.getInsertionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); + Assert.assertEquals(coCov.formatKey(rc.getDeletionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); + + // check cycle + Assert.assertEquals(cyCov.formatKey(rc.getMismatchesKeySet(i)[3]), "" + (i+1)); + Assert.assertEquals(cyCov.formatKey(rc.getInsertionsKeySet(i)[3]), "" + (i+1)); + Assert.assertEquals(cyCov.formatKey(rc.getDeletionsKeySet(i)[3]), "" + (i+1)); + } + + } + + } + + } + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ReadGroupCovariateUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ReadGroupCovariateUnitTest.java new file mode 100644 index 000000000..34548aee3 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ReadGroupCovariateUnitTest.java @@ -0,0 +1,125 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.recalibration.covariates.ReadGroupCovariate; +import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +/** + * @author Mauricio Carneiro + * @since 3/1/12 + */ +public class ReadGroupCovariateUnitTest { + ReadGroupCovariate covariate; + RecalibrationArgumentCollection RAC; + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + covariate = new ReadGroupCovariate(); + covariate.initialize(RAC); + } + + @BeforeMethod + public void initCache() { + ReadCovariates.clearKeysCache(); + } + + @Test(enabled = true) + public void testSingleRecord() { + final String expected = "SAMPLE.1"; + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID"); + rg.setPlatformUnit(expected); + runTest(rg, expected, covariate); + } + + @Test(enabled = true) + public void testMissingPlatformUnit() { + final String expected = "MY.7"; + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(expected); + runTest(rg, expected, covariate); + } + + @Test(enabled = true) + public void testForceReadgroup() { + final RecalibrationArgumentCollection forcedRAC = new RecalibrationArgumentCollection(); + forcedRAC.FORCE_READGROUP = "FOO"; + final ReadGroupCovariate forcedCovariate = new ReadGroupCovariate(); + forcedCovariate.initialize(forcedRAC); + + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("NOT_FOO"); + runTest(rg, "FOO", forcedCovariate); + } + + private static void runTest(final GATKSAMReadGroupRecord rg, final String expected, final ReadGroupCovariate covariate) { + GATKSAMRecord read = ReadUtils.createRandomRead(10); + read.setReadGroup(rg); + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), expected, covariate); + + } + + private static void verifyCovariateArray(final int[][] values, final String expected, final ReadGroupCovariate covariate) { + for (int[] value : values) { + String actual = covariate.formatKey(value[0]); + Assert.assertEquals(actual, expected); + } + } + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalDatumUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalDatumUnitTest.java new file mode 100644 index 000000000..3c9048fae --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalDatumUnitTest.java @@ -0,0 +1,313 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + + +// the imports for unit testing. + + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Collections; + + +public class RecalDatumUnitTest extends BaseTest { + + // -------------------------------------------------------------------------------- + // + // merge case Provider + // + // -------------------------------------------------------------------------------- + + private class RecalDatumTestProvider extends TestDataProvider { + int exError, exTotal, reportedQual; + + private RecalDatumTestProvider(int E, int N, int reportedQual) { + super(RecalDatumTestProvider.class); + + this.exError = E; + this.exTotal = N; + this.reportedQual = reportedQual; + } + + public double getErrorRate() { + return (exError + 1) / (1.0 * (exTotal + 2)); + } + + public double getErrorRatePhredScaled() { + return QualityUtils.phredScaleErrorRate(getErrorRate()); + } + + public int getReportedQual() { + return reportedQual; + } + + public RecalDatum makeRecalDatum() { + return new RecalDatum((long)exTotal, (double)exError, (byte)getReportedQual()); + } + + @Override + public String toString() { + return String.format("exError=%d, exTotal=%d, reportedQual=%d", exError, exTotal, reportedQual); + } + } + + private static boolean createdDatumTestProviders = false; + + @DataProvider(name = "RecalDatumTestProvider") + public Object[][] makeRecalDatumTestProvider() { + if ( !createdDatumTestProviders ) { + for ( int E : Arrays.asList(1, 10, 100, 1000, 10000) ) + for ( int N : Arrays.asList(10, 100, 1000, 10000, 100000, 1000000) ) + for ( int reportedQual : Arrays.asList(10, 20) ) + if ( E <= N ) + new RecalDatumTestProvider(E, N, reportedQual); + createdDatumTestProviders = true; + } + + return RecalDatumTestProvider.getTests(RecalDatumTestProvider.class); + } + + @Test(dataProvider = "RecalDatumTestProvider") + public void testRecalDatumBasics(RecalDatumTestProvider cfg) { + final RecalDatum datum = cfg.makeRecalDatum(); + assertBasicFeaturesOfRecalDatum(datum, cfg); + } + + private static void assertBasicFeaturesOfRecalDatum(final RecalDatum datum, final RecalDatumTestProvider cfg) { + Assert.assertEquals(datum.getNumMismatches(), cfg.exError, 1E-6); + Assert.assertEquals(datum.getNumObservations(), cfg.exTotal, 1E-6); + if ( cfg.getReportedQual() != -1 ) + Assert.assertEquals(datum.getEstimatedQReportedAsByte(), cfg.getReportedQual()); + assertEqualsDoubleSmart(datum.getEmpiricalErrorRate(), cfg.getErrorRate()); + + final double e = datum.getEmpiricalQuality(); + Assert.assertTrue(datum.getEmpiricalQualityAsByte() >= Math.floor(e)); + Assert.assertTrue(datum.getEmpiricalQualityAsByte() <= Math.ceil(e)); + Assert.assertNotNull(datum.toString()); + } + + @Test(dataProvider = "RecalDatumTestProvider") + public void testRecalDatumCopyAndCombine(RecalDatumTestProvider cfg) { + final RecalDatum datum = cfg.makeRecalDatum(); + final RecalDatum copy = new RecalDatum(datum); + assertBasicFeaturesOfRecalDatum(copy, cfg); + + RecalDatumTestProvider combinedCfg = new RecalDatumTestProvider(cfg.exError * 2, cfg.exTotal * 2, cfg.reportedQual); + copy.combine(datum); + assertBasicFeaturesOfRecalDatum(copy, combinedCfg); + } + + @Test(dataProvider = "RecalDatumTestProvider") + public void testRecalDatumModification(RecalDatumTestProvider cfg) { + RecalDatum datum = cfg.makeRecalDatum(); + datum.setEmpiricalQuality(10.1); + Assert.assertEquals(datum.getEmpiricalQuality(), 10.1); + + datum.setEstimatedQReported(10.1); + Assert.assertEquals(datum.getEstimatedQReported(), 10.1); + Assert.assertEquals(datum.getEstimatedQReportedAsByte(), 10); + + datum = cfg.makeRecalDatum(); + cfg.exTotal = 100000; + datum.setNumObservations(cfg.exTotal); + assertBasicFeaturesOfRecalDatum(datum, cfg); + + datum = cfg.makeRecalDatum(); + cfg.exError = 1000; + datum.setNumMismatches(cfg.exError); + assertBasicFeaturesOfRecalDatum(datum, cfg); + + datum = cfg.makeRecalDatum(); + datum.increment(true); + cfg.exError++; + cfg.exTotal++; + assertBasicFeaturesOfRecalDatum(datum, cfg); + + datum = cfg.makeRecalDatum(); + datum.increment(false); + cfg.exTotal++; + assertBasicFeaturesOfRecalDatum(datum, cfg); + + datum = cfg.makeRecalDatum(); + datum.incrementNumObservations(2); + cfg.exTotal += 2; + assertBasicFeaturesOfRecalDatum(datum, cfg); + + datum = cfg.makeRecalDatum(); + datum.incrementNumMismatches(2); + cfg.exError += 2; + assertBasicFeaturesOfRecalDatum(datum, cfg); + + + datum = cfg.makeRecalDatum(); + datum.increment(10, 5); + cfg.exError += 5; + cfg.exTotal += 10; + assertBasicFeaturesOfRecalDatum(datum, cfg); + } + + @Test + public void testNoObs() { + final RecalDatum rd = new RecalDatum(0L, 0.0, (byte)10); + Assert.assertEquals(rd.getEmpiricalErrorRate(), 0.0); + } + + @Test + public void testlog10QempPrior() { + for ( int Qemp = 0; Qemp <= QualityUtils.MAX_SAM_QUAL_SCORE; Qemp++ ) { + for ( int Qrep = 0; Qrep <= QualityUtils.MAX_SAM_QUAL_SCORE; Qrep++ ) { + final double log10prior = RecalDatum.log10QempPrior(Qemp, Qrep); + Assert.assertTrue(log10prior < 0.0); + Assert.assertFalse(Double.isInfinite(log10prior)); + Assert.assertFalse(Double.isNaN(log10prior)); + } + } + + final int Qrep = 20; + int maxQemp = -1; + double maxQempValue = -Double.MAX_VALUE; + for ( int Qemp = 0; Qemp <= QualityUtils.MAX_SAM_QUAL_SCORE; Qemp++ ) { + final double log10prior = RecalDatum.log10QempPrior(Qemp, Qrep); + if ( log10prior > maxQempValue ) { + maxQemp = Qemp; + maxQempValue = log10prior; + } + } + Assert.assertEquals(maxQemp, Qrep); + } + + @Test + public void testBayesianEstimateOfEmpiricalQuality() { + + final int Qrep = 20; + + // test no shift + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(0, 0, Qrep), (double)Qrep); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10, 0, Qrep), (double)Qrep); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000, 10, Qrep), (double)Qrep); + + // test small shift + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10, 10, Qrep), Qrep - 1.0); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000, 0, Qrep), Qrep + 1.0); + + // test medium shift + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10000, 0, Qrep), Qrep + 3.0); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10000, 10, Qrep), Qrep + 3.0); + + // test large shift + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(100000, 10, Qrep), Qrep + 8.0); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000000, 10, Qrep), Qrep + 16.0); + } + + @Test + public void testlog10QempLikelihood() { + + final double[] Qemps = new double[] { 0.0, 10.0, 20.0, 30.0 }; + final int[] observations = new int[] {0, 10, 1000, 1000000}; + final int[] errors = new int[] {0, 10, 1000, 1000000}; + + for ( double Qemp : Qemps ) { + for ( int observation : observations ) { + for ( int error : errors ) { + if ( error > observation ) + continue; + + final double log10likelihood = RecalDatum.log10QempLikelihood(Qemp, observation, error); + Assert.assertTrue(observation == 0 ? MathUtils.compareDoubles(log10likelihood, 0.0) == 0 : log10likelihood < 0.0); + Assert.assertFalse(Double.isInfinite(log10likelihood)); + Assert.assertFalse(Double.isNaN(log10likelihood)); + } + } + } + + long bigNum = new Long((long)Integer.MAX_VALUE); + bigNum *= 2L; + final double log10likelihood = RecalDatum.log10QempLikelihood(30, bigNum, 100000); + Assert.assertTrue(log10likelihood < 0.0); + Assert.assertFalse(Double.isInfinite(log10likelihood)); + Assert.assertFalse(Double.isNaN(log10likelihood)); + } + + @Test + public void basicHierarchicalBayesianQualityEstimateTest() { + + for( double epsilon = 15.0; epsilon <= 60.0; epsilon += 2.0 ) { + double RG_Q = 45.0; + RecalDatum RG = new RecalDatum( (long)100000000, (long) (100000000 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q); + double Q = 30.0; + RecalDatum QS = new RecalDatum( (long)100000000, (long) (100000000 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q); + RecalDatum COV = new RecalDatum( (long)15, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality + + // initial epsilon condition shouldn't matter when there are a lot of observations + Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( epsilon, RG, QS, Collections.singletonList(COV)), Q, 1E-4 ); + } + + for( double epsilon = 15.0; epsilon <= 60.0; epsilon += 2.0 ) { + double RG_Q = 45.0; + RecalDatum RG = new RecalDatum( (long)10, (long) (10 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q); + double Q = 30.0; + RecalDatum QS = new RecalDatum( (long)10, (long) (10 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q); + RecalDatum COV = new RecalDatum( (long)15, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality + + // initial epsilon condition dominates when there is no data + Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( epsilon, RG, QS, Collections.singletonList(COV)), epsilon, 1E-4 ); + } + + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalUtilsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalUtilsUnitTest.java new file mode 100644 index 000000000..0e95122da --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalUtilsUnitTest.java @@ -0,0 +1,178 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +public final class RecalUtilsUnitTest extends BaseTest { + private class Row { + int rg, qual, ne, no; + + private Row(final Row copy) { + this(copy.rg, copy.qual, copy.ne, copy.no); + } + + private Row(int rg, int qual, int ne, int no) { + this.rg = rg; + this.qual = qual; + this.ne = ne; + this.no = no; + } + + @Override + public String toString() { + return "Row{" + + "" + rg + + ", " + qual + + ", " + ne + + ", " + no + + '}'; + } + } + + @DataProvider(name = "CombineTablesProvider") + public Object[][] createCombineTablesProvider() { + List tests = new ArrayList(); + + final List rows = new ArrayList(); + for ( final int rg : Arrays.asList(0, 1) ) { + for ( final int qual : Arrays.asList(0, 1) ) { + rows.add(new Row(rg, qual, 1, 10)); + } + } + + logger.warn("Number of rows " + rows.size()); + + List> permutations = new LinkedList>(); + permutations.addAll(Utils.makePermutations(rows, 1, false)); + permutations.addAll(Utils.makePermutations(rows, 2, false)); + permutations.addAll(Utils.makePermutations(rows, 3, false)); + + // adding 1 row to 2 + for ( final List table1 : permutations ) { + for ( final Row table2 : rows ) { + tests.add(new Object[]{table1, Arrays.asList(table2)}); + } + } + + // adding 2 rows to 1 + for ( final List table1 : permutations ) { + for ( final Row table2 : rows ) { + tests.add(new Object[]{Arrays.asList(table2), table1}); + } + } + + for ( final List table1 : permutations ) { + for ( final List table2 : permutations ) { + tests.add(new Object[]{table1, table2}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "CombineTablesProvider") + public void testCombineTables(final List table1, final List table2) { + final NestedIntegerArray nia1 = makeTable(table1); + final NestedIntegerArray nia2 = makeTable(table2); + final List expectedRows = makeExpected(table1, table2); + final NestedIntegerArray expected = makeTable(expectedRows); + RecalUtils.combineTables(nia1, nia2); + + Assert.assertEquals(nia1.getDimensions(), expected.getDimensions()); + Assert.assertEquals(nia1.getAllValues().size(), expected.getAllValues().size()); + + for ( final NestedIntegerArray.Leaf leaf : expected.getAllLeaves() ) { + final RecalDatum actual = nia1.get(leaf.keys); + Assert.assertEquals(actual.getNumMismatches(), leaf.value.getNumMismatches()); + Assert.assertEquals(actual.getNumObservations(), leaf.value.getNumObservations()); + } + } + + public List makeExpected(final List table1, final List table2) { + final List combined = new LinkedList(); + for ( final Row t1 : table1 ) combined.add(new Row(t1)); + for ( final Row t2 : table2 ) { + combine(combined, t2); + } + return combined; + } + + private void combine(final List combined, final Row row) { + for ( final Row c : combined ) { + if ( c.rg == row.rg && c.qual == row.qual ) { + c.ne += row.ne; + c.no += row.no; + return; + } + } + + combined.add(new Row(row)); + } + + public NestedIntegerArray makeTable(final List rows) { + final NestedIntegerArray x = new NestedIntegerArray(3, 3); + for ( final Row r : rows ) + x.put(new RecalDatum((long)r.no, (double)r.ne, (byte)10), r.rg, r.qual); + return x; + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationReportUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationReportUnitTest.java new file mode 100644 index 000000000..d16f718be --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationReportUnitTest.java @@ -0,0 +1,176 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.recalibration.covariates.*; +import org.broadinstitute.gatk.utils.recalibration.EventType; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; +import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * @author carneiro + * @since 4/21/12 + */ +public class RecalibrationReportUnitTest { + @BeforeMethod + public void init() { + ReadCovariates.clearKeysCache(); + } + + private static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { + final Random random = new Random(); + final int nObservations = random.nextInt(maxObservations); + final int nErrors = Math.min(random.nextInt(maxErrors), nObservations); + final int qual = random.nextInt(QualityUtils.MAX_SAM_QUAL_SCORE); + return new RecalDatum((long)nObservations, (double)nErrors, (byte)qual); + } + + @Test + public void testOutput() { + final int length = 100; + + List quals = new ArrayList(QualityUtils.MAX_SAM_QUAL_SCORE + 1); + List counts = new ArrayList(QualityUtils.MAX_SAM_QUAL_SCORE + 1); + + for (int i = 0; i<= QualityUtils.MAX_SAM_QUAL_SCORE; i++) { + quals.add((byte) i); + counts.add(1L); + } + + final QuantizationInfo quantizationInfo = new QuantizationInfo(quals, counts); + final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + + quantizationInfo.noQuantization(); + final List requiredCovariates = new LinkedList(); + final List optionalCovariates = new LinkedList(); + + final ReadGroupCovariate rgCovariate = new ReadGroupCovariate(); + rgCovariate.initialize(RAC); + requiredCovariates.add(rgCovariate); + + final QualityScoreCovariate qsCovariate = new QualityScoreCovariate(); + qsCovariate.initialize(RAC); + requiredCovariates.add(qsCovariate); + + final ContextCovariate cxCovariate = new ContextCovariate(); + cxCovariate.initialize(RAC); + optionalCovariates.add(cxCovariate); + final CycleCovariate cyCovariate = new CycleCovariate(); + cyCovariate.initialize(RAC); + optionalCovariates.add(cyCovariate); + + final Covariate[] requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; + int covariateIndex = 0; + for (final Covariate cov : requiredCovariates) + requestedCovariates[covariateIndex++] = cov; + for (final Covariate cov : optionalCovariates) + requestedCovariates[covariateIndex++] = cov; + + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("id"); + rg.setPlatform("illumina"); + final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); + read.setReadGroup(rg); + final byte [] readQuals = new byte[length]; + for (int i = 0; i < length; i++) + readQuals[i] = 20; + read.setBaseQualities(readQuals); + + final int expectedKeys = expectedNumberOfKeys(length, RAC.INDELS_CONTEXT_SIZE, RAC.MISMATCHES_CONTEXT_SIZE); + int nKeys = 0; // keep track of how many keys were produced + final ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); + + final RecalibrationTables recalibrationTables = new RecalibrationTables(requestedCovariates); + final NestedIntegerArray rgTable = recalibrationTables.getReadGroupTable(); + final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); + + for (int offset = 0; offset < length; offset++) { + + for (EventType errorMode : EventType.values()) { + + final int[] covariates = rc.getKeySet(offset, errorMode); + final int randomMax = errorMode == EventType.BASE_SUBSTITUTION ? 10000 : 100000; + + rgTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.ordinal()); + qualTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.ordinal()); + nKeys += 2; + for (int j = 0; j < optionalCovariates.size(); j++) { + final NestedIntegerArray covTable = recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j); + final int covValue = covariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j]; + if ( covValue >= 0 ) { + covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], covValue, errorMode.ordinal()); + nKeys++; + } + } + } + } + Assert.assertEquals(nKeys, expectedKeys); + } + + private static int expectedNumberOfKeys (int readLength, int indelContextSize, int mismatchesContextSize) { + final int numCovariates = 4; + final int numTables = 3; + final int mismatchContextPadding = mismatchesContextSize - 1; + final int indelContextPadding = 2 * (indelContextSize - 1); + final int indelCyclePadding = 2 * (2 * CycleCovariate.CUSHION_FOR_INDELS); + + return (numCovariates * numTables * readLength) - mismatchContextPadding - indelContextPadding - indelCyclePadding; + } + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTablesUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTablesUnitTest.java new file mode 100644 index 000000000..f40ef2602 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTablesUnitTest.java @@ -0,0 +1,203 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; +import org.broadinstitute.gatk.utils.recalibration.EventType; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.List; + +public final class RecalibrationTablesUnitTest extends BaseTest { + private RecalibrationTables tables; + private Covariate[] covariates; + private int numReadGroups = 6; + final byte qualByte = 1; + final List combineStates = Arrays.asList(0, 1, 2); + + @BeforeMethod + private void makeTables() { + covariates = RecalibrationTestUtils.makeInitializedStandardCovariates(); + tables = new RecalibrationTables(covariates, numReadGroups); + fillTable(tables); + } + + private void fillTable(final RecalibrationTables tables) { + for ( int iterations = 0; iterations < 10; iterations++ ) { + for ( final EventType et : EventType.values() ) { + for ( final int rg : combineStates) { + final double error = rg % 2 == 0 ? 1 : 0; + RecalUtils.incrementDatumOrPutIfNecessary(tables.getReadGroupTable(), qualByte, error, rg, et.ordinal()); + for ( final int qual : combineStates) { + RecalUtils.incrementDatumOrPutIfNecessary(tables.getQualityScoreTable(), qualByte, error, rg, qual, et.ordinal()); + for ( final int cycle : combineStates) + RecalUtils.incrementDatumOrPutIfNecessary(tables.getTable(2), qualByte, error, rg, qual, cycle, et.ordinal()); + for ( final int context : combineStates) + RecalUtils.incrementDatumOrPutIfNecessary(tables.getTable(3), qualByte, error, rg, qual, context, et.ordinal()); + } + } + } + } + } + + @Test + public void basicTest() { + final Covariate qualCov = covariates[1]; + final Covariate cycleCov = covariates[2]; + final Covariate contextCov = covariates[3]; + + Assert.assertEquals(tables.numTables(), covariates.length); + + Assert.assertNotNull(tables.getReadGroupTable()); + Assert.assertEquals(tables.getReadGroupTable(), tables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal())); + testDimensions(tables.getReadGroupTable(), numReadGroups); + + Assert.assertNotNull(tables.getQualityScoreTable()); + Assert.assertEquals(tables.getQualityScoreTable(), tables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE.ordinal())); + testDimensions(tables.getQualityScoreTable(), numReadGroups, qualCov.maximumKeyValue() + 1); + + Assert.assertNotNull(tables.getTable(2)); + testDimensions(tables.getTable(2), numReadGroups, qualCov.maximumKeyValue() + 1, cycleCov.maximumKeyValue() + 1); + + Assert.assertNotNull(tables.getTable(3)); + testDimensions(tables.getTable(3), numReadGroups, qualCov.maximumKeyValue() + 1, contextCov.maximumKeyValue() + 1); + } + + private void testDimensions(final NestedIntegerArray table, final int ... dimensions) { + final int[] dim = new int[dimensions.length+1]; + System.arraycopy(dimensions, 0, dim, 0, dimensions.length); + dim[dimensions.length] = EventType.values().length; + Assert.assertEquals(table.getDimensions().length, dim.length); + + for ( int i = 0; i < dim.length; i++ ) { + Assert.assertEquals(table.getDimensions()[i], dim[i], "Table dimensions not expected at dim " + i); + } + } + + @Test + public void basicMakeQualityScoreTable() { + final Covariate qualCov = covariates[1]; + final NestedIntegerArray copy = tables.makeQualityScoreTable(); + testDimensions(copy, numReadGroups, qualCov.maximumKeyValue()+1); + Assert.assertEquals(copy.getAllValues().size(), 0); + } + + @Test + public void testCombine1() { + final RecalibrationTables merged = new RecalibrationTables(covariates, numReadGroups); + fillTable(merged); + + merged.combine(tables); + + for ( int i = 0; i < tables.numTables(); i++ ) { + NestedIntegerArray table = tables.getTable(i); + NestedIntegerArray mergedTable = merged.getTable(i); + + Assert.assertEquals(table.getAllLeaves().size(), mergedTable.getAllLeaves().size()); + for ( final NestedIntegerArray.Leaf leaf : table.getAllLeaves() ) { + final RecalDatum mergedValue = mergedTable.get(leaf.keys); + Assert.assertNotNull(mergedValue); + Assert.assertEquals(mergedValue.getNumObservations(), leaf.value.getNumObservations() * 2); + Assert.assertEquals(mergedValue.getNumMismatches(), leaf.value.getNumMismatches() * 2); + } + } + } + + @Test + public void testCombineEmptyOther() { + final RecalibrationTables merged = new RecalibrationTables(covariates, numReadGroups); + + merged.combine(tables); + + for ( int i = 0; i < tables.numTables(); i++ ) { + NestedIntegerArray table = tables.getTable(i); + NestedIntegerArray mergedTable = merged.getTable(i); + + Assert.assertEquals(table.getAllLeaves().size(), mergedTable.getAllLeaves().size()); + for ( final NestedIntegerArray.Leaf leaf : table.getAllLeaves() ) { + final RecalDatum mergedValue = mergedTable.get(leaf.keys); + Assert.assertNotNull(mergedValue); + Assert.assertEquals(mergedValue.getNumObservations(), leaf.value.getNumObservations()); + Assert.assertEquals(mergedValue.getNumMismatches(), leaf.value.getNumMismatches()); + } + } + } + + @Test + public void testCombinePartial() { + final RecalibrationTables merged = new RecalibrationTables(covariates, numReadGroups); + for ( final int rg : combineStates) { + RecalUtils.incrementDatumOrPutIfNecessary(merged.getTable(3), qualByte, 1, rg, 0, 0, 0); + } + + merged.combine(tables); + for ( int i = 0; i < tables.numTables(); i++ ) { + NestedIntegerArray table = tables.getTable(i); + NestedIntegerArray mergedTable = merged.getTable(i); + + Assert.assertEquals(table.getAllLeaves().size(), mergedTable.getAllLeaves().size()); + for ( final NestedIntegerArray.Leaf leaf : table.getAllLeaves() ) { + final RecalDatum mergedValue = mergedTable.get(leaf.keys); + Assert.assertNotNull(mergedValue); + + final int delta = i == 3 && leaf.keys[1] == 0 && leaf.keys[2] == 0 && leaf.keys[3] == 0 ? 1 : 0; + Assert.assertEquals(mergedValue.getNumObservations(), leaf.value.getNumObservations() + delta); + Assert.assertEquals(mergedValue.getNumMismatches(), leaf.value.getNumMismatches() + delta); + } + } + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTestUtils.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTestUtils.java new file mode 100644 index 000000000..ce374b047 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTestUtils.java @@ -0,0 +1,74 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.recalibration.covariates.*; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 12/23/12 + * Time: 1:06 PM + * To change this template use File | Settings | File Templates. + */ +public class RecalibrationTestUtils { + public static Covariate[] makeInitializedStandardCovariates() { + final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + final Covariate[] covariates = new Covariate[4]; + covariates[0] = new ReadGroupCovariate(); + covariates[1] = new QualityScoreCovariate(); + covariates[2] = new ContextCovariate(); + covariates[3] = new CycleCovariate(); + for ( Covariate cov : covariates ) cov.initialize(RAC); + return covariates; + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RepeatCovariatesUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RepeatCovariatesUnitTest.java new file mode 100644 index 000000000..66c12a55a --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RepeatCovariatesUnitTest.java @@ -0,0 +1,252 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import com.google.java.contract.Requires; +import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate; +import org.broadinstitute.gatk.engine.recalibration.covariates.RepeatLengthCovariate; +import org.broadinstitute.gatk.engine.recalibration.covariates.RepeatUnitAndLengthCovariate; +import org.broadinstitute.gatk.engine.recalibration.covariates.RepeatUnitCovariate; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.gatk.utils.BaseUtils; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Random; + +public class RepeatCovariatesUnitTest { + + RepeatLengthCovariate rlCovariate; + RepeatUnitCovariate ruCovariate; + RepeatUnitAndLengthCovariate rurlCovariate; + RecalibrationArgumentCollection RAC; + + + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + rlCovariate = new RepeatLengthCovariate(); + ruCovariate = new RepeatUnitCovariate(); + rurlCovariate = new RepeatUnitAndLengthCovariate(); + rlCovariate.initialize(RAC); + ruCovariate.initialize(RAC); + rurlCovariate.initialize(RAC); + } + + @BeforeMethod + public void initCache() { + ReadCovariates.clearKeysCache(); + } + + + @Test + public void testFindNumberOfRepetitions() { + // First, test logic to compute number of repetitions of a substring on a given string. + int result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACAC".getBytes(), true); + Assert.assertEquals(2,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACAC".getBytes(), true); + Assert.assertEquals(4,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), true); + Assert.assertEquals(4,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), true); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), true); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), true); + Assert.assertEquals(1,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), true); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), true); + Assert.assertEquals(0,result); + // Same tests but looking backward on string + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACAC".getBytes(), false); + Assert.assertEquals(2,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACAC".getBytes(), false); + Assert.assertEquals(4,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), false); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), false); + Assert.assertEquals(4,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), false); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), false); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), false); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), false); + Assert.assertEquals(3,result); + + // test logic to get repeat unit and number of repeats from covariate value + final String[] repUnits = new String[]{"AG","CCG","TCCA","T"}; + for (String ru : repUnits) { + for (int k=1; k < 10; k++) { + Pair pair = RepeatLengthCovariate.getRUandNRfromCovariate(String.format("%s%d",ru,k)); + Assert.assertEquals(pair.second.intValue(),k); + Assert.assertEquals(pair.first,ru); + } + } + + } + + /** + * Build synthetic reads with random content made up of tandem repeats, record computed Repeat Unit and # repeats and see if + * they match with read context + */ + @Test + public void testManyObservations() { + final int NUM_UNITS = 10; + final int MAX_REPEAT_UNIT_LENGTH = RAC.MAX_STR_UNIT_LENGTH; + final int MAX_NUM_REPETITIONS = RAC.MAX_REPEAT_LENGTH; + final int NUM_TEST_CASES = 100; + + Random random = new Random(); + + for (int r = 0; r < NUM_TEST_CASES; r++) { + final StringBuilder sb = new StringBuilder(); + // for each unit, generate a repeat unit at random with given random length + final ArrayList repeatUnits = new ArrayList(); + final ArrayList numsRepetitions = new ArrayList(); + for (int n=0; n < NUM_UNITS; n++) { + final int repLength = 1+random.nextInt(MAX_REPEAT_UNIT_LENGTH); + final String repeatUnit = getRandomBases(repLength); + final int numRepetitions = 1+random.nextInt(MAX_NUM_REPETITIONS); + + // log for comparison with covariate + numsRepetitions.add(numRepetitions); + repeatUnits.add(repeatUnit); + + for (int k=0; k < numRepetitions; k++) + sb.append(repeatUnit); + + } + + final String readBases = sb.toString(); + System.out.println(readBases); + final int readLength = readBases.length(); + + final byte[] readQuals = new byte[readLength]; + Arrays.fill(readQuals,(byte)30); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(),readQuals,readLength+"M"); + + Covariate[] requestedCovariates = new Covariate[3]; + requestedCovariates[0] = rlCovariate; + requestedCovariates[1] = ruCovariate; + requestedCovariates[2] = rurlCovariate; + ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); + + // check that the length is correct + Assert.assertEquals(rc.getMismatchesKeySet().length, readLength); + Assert.assertEquals(rc.getInsertionsKeySet().length, readLength); + Assert.assertEquals(rc.getDeletionsKeySet().length, readLength); + + for (int offset = 0; offset < readBases.length(); offset++) { // recalibrate all bases in the read + // check RepeatLength + final String rlValM = rlCovariate.formatKey(rc.getMismatchesKeySet(offset)[0]); + final String rlValI = rlCovariate.formatKey(rc.getInsertionsKeySet(offset)[0]); + final String rlValD = rlCovariate.formatKey(rc.getDeletionsKeySet(offset)[0]); + // check RepeatUnit + final String ruValM = ruCovariate.formatKey(rc.getMismatchesKeySet(offset)[1]); + final String ruValI = ruCovariate.formatKey(rc.getInsertionsKeySet(offset)[1]); + final String ruValD = ruCovariate.formatKey(rc.getDeletionsKeySet(offset)[1]); + // check RepeatUnitAndLength + final String rurlValM = rurlCovariate.formatKey(rc.getMismatchesKeySet(offset)[2]); + final String rurlValI = rurlCovariate.formatKey(rc.getInsertionsKeySet(offset)[2]); + final String rurlValD = rurlCovariate.formatKey(rc.getDeletionsKeySet(offset)[2]); + // check all 3 values are identical + Assert.assertEquals(rlValD,rlValI); + Assert.assertEquals(rlValM,rlValI); + Assert.assertEquals(ruValD,ruValI); + Assert.assertEquals(ruValM,ruValI); + Assert.assertEquals(rurlValD,rurlValI); + Assert.assertEquals(rurlValM,rurlValI); + + + int fw = GATKVariantContextUtils.findNumberOfRepetitions(ruValM.getBytes(), readBases.substring(offset + 1, readLength).getBytes(), true); + int bw = GATKVariantContextUtils.findNumberOfRepetitions(ruValM.getBytes(), readBases.substring(0, offset + 1).getBytes(), false); + Assert.assertEquals(Math.min(fw+bw,RAC.MAX_REPEAT_LENGTH),(int)Integer.valueOf(rlValM)); + } + + } + + + + + + + } + + /** + * Returns random bases of given length + * @param length required length + * @return given random string + */ + @Requires("length > 0") + String getRandomBases(final int length) { + byte[] bases = new byte[length]; + Random ran = new Random(); + for (int i=0; i < length; i++ ) { + int idx = ran.nextInt(4); + bases[i] = BaseUtils.baseIndexToSimpleBase(idx); + } + return new String(bases); + } + + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffEngineUnitTest.java deleted file mode 100644 index 4922e69d6..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffEngineUnitTest.java +++ /dev/null @@ -1,259 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - - -// the imports for unit testing. - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffElement; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffEngine; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffNode; -import org.broadinstitute.gatk.engine.walkers.diffengine.Difference; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * Basic unit test for DifferableReaders in reduced reads - */ -public class DiffEngineUnitTest extends BaseTest { - DiffEngine engine; - - @BeforeClass(enabled = true) - public void createDiffEngine() { - engine = new DiffEngine(); - } - - // -------------------------------------------------------------------------------- - // - // Difference testing routines - // - // -------------------------------------------------------------------------------- - - private class DifferenceTest extends TestDataProvider { - public DiffElement tree1, tree2; - public List differences; - - private DifferenceTest(String tree1, String tree2) { - this(tree1, tree2, Collections.emptyList()); - } - - private DifferenceTest(String tree1, String tree2, String difference) { - this(tree1, tree2, Arrays.asList(difference)); - } - - private DifferenceTest(String tree1, String tree2, List differences) { - super(DifferenceTest.class); - this.tree1 = DiffNode.fromString(tree1); - this.tree2 = DiffNode.fromString(tree2); - this.differences = differences; - } - - public String toString() { - return String.format("tree1=%s tree2=%s diff=%s", - tree1.toOneLineString(), tree2.toOneLineString(), differences); - } - } - - @DataProvider(name = "trees") - public Object[][] createTrees() { - new DifferenceTest("A=X", "A=X"); - new DifferenceTest("A=X", "A=Y", "A:X!=Y"); - new DifferenceTest("A=X", "B=X", Arrays.asList("A:X!=MISSING", "B:MISSING!=X")); - new DifferenceTest("A=(X=1)", "B=(X=1)", Arrays.asList("A:(X=1)!=MISSING", "B:MISSING!=(X=1)")); - new DifferenceTest("A=(X=1)", "A=(X=1)"); - new DifferenceTest("A=(X=1 Y=2)", "A=(X=1 Y=2)"); - new DifferenceTest("A=(X=1 Y=2 B=(Z=3))", "A=(X=1 Y=2 B=(Z=3))"); - new DifferenceTest("A=(X=1)", "A=(X=2)", "A.X:1!=2"); - new DifferenceTest("A=(X=1 Y=2 B=(Z=3))", "A=(X=1 Y=2 B=(Z=4))", "A.B.Z:3!=4"); - new DifferenceTest("A=(X=1)", "A=(X=1 Y=2)", "A.Y:MISSING!=2"); - new DifferenceTest("A=(X=1 Y=2 B=(Z=3))", "A=(X=1 Y=2)", "A.B:(Z=3)!=MISSING"); - return DifferenceTest.getTests(DifferenceTest.class); - } - - @Test(enabled = true, dataProvider = "trees") - public void testDiffs(DifferenceTest test) { - logger.warn("Test tree1: " + test.tree1.toOneLineString()); - logger.warn("Test tree2: " + test.tree2.toOneLineString()); - - List diffs = engine.diff(test.tree1, test.tree2); - logger.warn("Test expected diff : " + test.differences); - logger.warn("Observed diffs : " + diffs); - } - - // -------------------------------------------------------------------------------- - // - // Low-level routines for summarizing differences - // - // -------------------------------------------------------------------------------- - - @Test(enabled = true) - public void testLongestCommonPostfix() { - testLongestCommonPostfixHelper("A", "A", 1); - testLongestCommonPostfixHelper("A", "B", 0); - testLongestCommonPostfixHelper("A.B", "A.B", 2); - testLongestCommonPostfixHelper("A.B.C", "A.B.C", 3); - testLongestCommonPostfixHelper("A.B.C", "X.B.C", 2); - testLongestCommonPostfixHelper("A.B.C", "X.Y.C", 1); - testLongestCommonPostfixHelper("A.B.C", "X.Y.Z", 0); - testLongestCommonPostfixHelper("A.B.C", "A.X.C", 1); - testLongestCommonPostfixHelper("A.B.C", "A.X.Z", 0); - testLongestCommonPostfixHelper("A.B.C", "A.B.Z", 0); - } - - public void testLongestCommonPostfixHelper(String p1, String p2, int expected) { - String[] parts1 = p1.split("\\."); - String[] parts2 = p2.split("\\."); - int obs = DiffEngine.longestCommonPostfix(parts1, parts2); - Assert.assertEquals(obs, expected, "p1=" + p1 + " p2=" + p2 + " failed"); - } - - @Test(enabled = true, dependsOnMethods = "testLongestCommonPostfix") - public void testSummarizePath() { - testSummarizePathHelper("A", "A", "A"); - testSummarizePathHelper("A", "B", "*"); - testSummarizePathHelper("A.B", "A.B", "A.B"); - testSummarizePathHelper("A.B", "X.B", "*.B"); - testSummarizePathHelper("A.B", "X.Y", "*.*"); - testSummarizePathHelper("A.B.C", "A.B.C", "A.B.C"); - testSummarizePathHelper("A.B.C", "X.B.C", "*.B.C"); - testSummarizePathHelper("A.B.C", "X.Y.C", "*.*.C"); - testSummarizePathHelper("A.B.C", "X.Y.Z", "*.*.*"); - testSummarizePathHelper("A.B.C", "A.X.C", "*.*.C"); - testSummarizePathHelper("A.B.C", "A.X.Z", "*.*.*"); - testSummarizePathHelper("A.B.C", "A.B.Z", "*.*.*"); - } - - public void testSummarizePathHelper(String p1, String p2, String expected) { - String[] parts1 = DiffEngine.diffNameToPath(p1); - String[] parts2 = DiffEngine.diffNameToPath(p2); - int obs = DiffEngine.longestCommonPostfix(parts1, parts2); - String path = DiffEngine.summarizedPath(parts2, obs); - Assert.assertEquals(path, expected, "p1=" + p1 + " p2=" + p2 + " failed"); - } - - // -------------------------------------------------------------------------------- - // - // High-level difference summary - // - // -------------------------------------------------------------------------------- - - private class SummarizeDifferenceTest extends TestDataProvider { - List diffs = new ArrayList(); - List expecteds = new ArrayList(); - - public SummarizeDifferenceTest() { super(SummarizeDifferenceTest.class); } - - public SummarizeDifferenceTest addDiff(String... diffsToAdd) { - diffs.addAll(Arrays.asList(diffsToAdd)); - return this; - } - - public SummarizeDifferenceTest addSummary(String... expectedSummary) { - expecteds.addAll(Arrays.asList(expectedSummary)); - return this; - } - - public String toString() { - return String.format("diffs=%s => expected=%s", diffs, expecteds); - } - - public void test() { - List diffPaths = new ArrayList(diffs.size()); - for ( String diff : diffs ) { diffPaths.add(DiffEngine.diffNameToPath(diff)); } - - List sumDiffs = engine.summarizedDifferencesOfPathsFromString(diffs); - - Assert.assertEquals(sumDiffs.size(), expecteds.size(), "Unexpected number of summarized differences: " + sumDiffs); - - for ( int i = 0; i < sumDiffs.size(); i++ ) { - Difference sumDiff = sumDiffs.get(i); - String expected = expecteds.get(i); - String[] pathCount = expected.split(":"); - String path = pathCount[0]; - int count = Integer.valueOf(pathCount[1]); - Assert.assertEquals(sumDiff.getPath(), path, "Unexpected path at: " + expected + " obs=" + sumDiff + " all=" + sumDiffs); - Assert.assertEquals(sumDiff.getCount(), count, "Unexpected counts at: " + expected + " obs=" + sumDiff + " all=" + sumDiffs); - } - } - } - - @DataProvider(name = "summaries") - public Object[][] createSummaries() { - new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2"); - new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1"); - new SummarizeDifferenceTest().addDiff("A", "A", "A").addSummary("A:3"); - new SummarizeDifferenceTest().addDiff("A", "A", "A", "B").addSummary("A:3", "B:1"); - new SummarizeDifferenceTest().addDiff("A", "A", "A", "B", "B").addSummary("A:3", "B:2"); - new SummarizeDifferenceTest().addDiff("A", "A", "A", "B", "B", "C").addSummary("A:3", "B:2", "C:1"); - new SummarizeDifferenceTest().addDiff("A.X", "A.X").addSummary("A.X:2"); - new SummarizeDifferenceTest().addDiff("A.X", "A.X", "B.X").addSummary("*.X:3", "A.X:2", "B.X:1"); - new SummarizeDifferenceTest().addDiff("A.X", "A.X", "B.X", "B.X").addSummary("*.X:4", "A.X:2", "B.X:2"); - new SummarizeDifferenceTest().addDiff("A.B.C", "X.B.C").addSummary("*.B.C:2", "A.B.C:1", "X.B.C:1"); - new SummarizeDifferenceTest().addDiff("A.B.C", "X.Y.C", "X.Y.C").addSummary("*.*.C:3", "X.Y.C:2", "A.B.C:1"); - new SummarizeDifferenceTest().addDiff("A.B.C", "A.X.C", "X.Y.C").addSummary("*.*.C:3", "A.B.C:1", "A.X.C:1", "X.Y.C:1"); - new SummarizeDifferenceTest().addDiff("A.B.C", "A.X.C", "B.X.C").addSummary("*.*.C:3", "*.X.C:2", "A.B.C:1", "A.X.C:1", "B.X.C:1"); - new SummarizeDifferenceTest().addDiff("A.B.C", "A.X.C", "B.X.C", "B.X.C").addSummary("*.*.C:4", "*.X.C:3", "B.X.C:2", "A.B.C:1", "A.X.C:1"); - - return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class); - } - - - @Test(enabled = true, dependsOnMethods = "testSummarizePath", dataProvider = "summaries") - public void testSummarizeDifferences(SummarizeDifferenceTest test) { - test.test(); - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffNodeUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffNodeUnitTest.java deleted file mode 100644 index 388ba518f..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffNodeUnitTest.java +++ /dev/null @@ -1,278 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - - -// the imports for unit testing. - - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffElement; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffNode; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffValue; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * Basic unit test for DifferableReaders in reduced reads - */ -public class DiffNodeUnitTest extends BaseTest { - // Data is: - // MY_ROOT - // fields: A=A, B=B - // nodes: C, D - // C: fields: E=E, nodes: none - // D: fields: F=F, G=G, nodes: none - static DiffNode MY_ROOT = DiffNode.rooted("MY_ROOT"); - static DiffValue Value_A = new DiffValue("A", MY_ROOT, "A"); - static DiffValue Value_B = new DiffValue("B", MY_ROOT, "B"); - static DiffNode NODE_C = DiffNode.empty("C", MY_ROOT); - static DiffNode NODE_D = DiffNode.empty("D", MY_ROOT); - static DiffValue Value_E = new DiffValue("E", NODE_C, "E"); - static DiffValue Value_F = new DiffValue("F", NODE_D, "F"); - static DiffValue Value_G = new DiffValue("G", NODE_D, "G"); - - static { - MY_ROOT.add(Value_A); - MY_ROOT.add(Value_B); - MY_ROOT.add(NODE_C); - MY_ROOT.add(NODE_D); - NODE_C.add(Value_E); - NODE_D.add(Value_F); - NODE_D.add(Value_G); - } - - - // -------------------------------------------------------------------------------- - // - // Element testing routines - // - // -------------------------------------------------------------------------------- - - private class ElementTest extends TestDataProvider { - public DiffElement elt; - public String name; - public String fullName; - public DiffElement parent; - - private ElementTest(DiffValue elt, DiffValue parent, String name, String fullName) { - this(elt.getBinding(), parent.getBinding(), name, fullName); - } - - private ElementTest(DiffElement elt, DiffElement parent, String name, String fullName) { - super(ElementTest.class); - this.elt = elt; - this.name = name; - this.fullName = fullName; - this.parent = parent; - } - - public String toString() { - return String.format("ElementTest elt=%s name=%s fullName=%s parent=%s", - elt.toOneLineString(), name, fullName, parent.getName()); - } - } - - @DataProvider(name = "elementdata") - public Object[][] createElementData() { - new ElementTest(MY_ROOT.getBinding(), DiffElement.ROOT, "MY_ROOT", "MY_ROOT"); - new ElementTest(NODE_C, MY_ROOT, "C", "MY_ROOT.C"); - new ElementTest(NODE_D, MY_ROOT, "D", "MY_ROOT.D"); - new ElementTest(Value_A, MY_ROOT, "A", "MY_ROOT.A"); - new ElementTest(Value_B, MY_ROOT, "B", "MY_ROOT.B"); - new ElementTest(Value_E, NODE_C, "E", "MY_ROOT.C.E"); - new ElementTest(Value_F, NODE_D, "F", "MY_ROOT.D.F"); - new ElementTest(Value_G, NODE_D, "G", "MY_ROOT.D.G"); - return TestDataProvider.getTests(ElementTest.class); - } - - @Test(enabled = true, dataProvider = "elementdata") - public void testElementMethods(ElementTest test) { - Assert.assertNotNull(test.elt.getName()); - Assert.assertNotNull(test.elt.getParent()); - Assert.assertEquals(test.elt.getName(), test.name); - Assert.assertEquals(test.elt.getParent(), test.parent); - Assert.assertEquals(test.elt.fullyQualifiedName(), test.fullName); - } - - // -------------------------------------------------------------------------------- - // - // DiffValue testing routines - // - // -------------------------------------------------------------------------------- - - private class LeafTest extends TestDataProvider { - public DiffValue diffvalue; - public Object value; - - private LeafTest(DiffValue diffvalue, Object value) { - super(LeafTest.class); - this.diffvalue = diffvalue; - this.value = value; - } - - public String toString() { - return String.format("LeafTest diffvalue=%s value=%s", diffvalue.toOneLineString(), value); - } - } - - @DataProvider(name = "leafdata") - public Object[][] createLeafData() { - new LeafTest(Value_A, "A"); - new LeafTest(Value_B, "B"); - new LeafTest(Value_E, "E"); - new LeafTest(Value_F, "F"); - new LeafTest(Value_G, "G"); - return TestDataProvider.getTests(LeafTest.class); - } - - @Test(enabled = true, dataProvider = "leafdata") - public void testLeafMethods(LeafTest test) { - Assert.assertNotNull(test.diffvalue.getValue()); - Assert.assertEquals(test.diffvalue.getValue(), test.value); - } - - // -------------------------------------------------------------------------------- - // - // Node testing routines - // - // -------------------------------------------------------------------------------- - - private class NodeTest extends TestDataProvider { - public DiffNode node; - public Set fields; - public Set subnodes; - public Set allNames; - - private NodeTest(DiffNode node, List fields, List subnodes) { - super(NodeTest.class); - this.node = node; - this.fields = new HashSet(fields); - this.subnodes = new HashSet(subnodes); - this.allNames = new HashSet(fields); - allNames.addAll(subnodes); - } - - public String toString() { - return String.format("NodeTest node=%s fields=%s subnodes=%s", - node.toOneLineString(), fields, subnodes); - } - } - - @DataProvider(name = "nodedata") - public Object[][] createData1() { - new NodeTest(MY_ROOT, Arrays.asList("A", "B"), Arrays.asList("C", "D")); - new NodeTest(NODE_C, Arrays.asList("E"), Collections.emptyList()); - new NodeTest(NODE_D, Arrays.asList("F", "G"), Collections.emptyList()); - return TestDataProvider.getTests(NodeTest.class); - } - - @Test(enabled = true, dataProvider = "nodedata") - public void testNodeAccessors(NodeTest test) { - Assert.assertNotNull(test.node.getElements()); - - for ( String name : test.allNames ) { - DiffElement elt = test.node.getElement(name); - Assert.assertNotNull(elt, "Failed to find field " + elt + " in " + test.node); - Assert.assertEquals(elt.getName(), name); - Assert.assertEquals(elt.getValue().isAtomic(), test.fields.contains(name), "Failed atomic/compound expectation: " + test.node); - } - } - - // NOTE: add routines are being implicitly tested by the creation of the data structures - - @Test(enabled = true, dataProvider = "nodedata") - public void testCounts(NodeTest test) { - Assert.assertEquals(test.node.getElements().size(), test.allNames.size()); - Assert.assertEquals(test.node.getElementNames(), test.allNames); - } - - // -------------------------------------------------------------------------------- - // - // fromString testing routines - // - // -------------------------------------------------------------------------------- - - private class FromStringTest extends TestDataProvider { - public String string; - public DiffElement expected; - - private FromStringTest(String string, DiffElement expected) { - super(FromStringTest.class); - this.string = string; - this.expected = expected; - } - - public String toString() { - return String.format("FromStringTest string=%s expected=%s", string, expected.toOneLineString()); - } - } - - @DataProvider(name = "fromstringdata") - public Object[][] createFromData() { - new FromStringTest("A=A", Value_A.getBinding()); - new FromStringTest("B=B", Value_B.getBinding()); - new FromStringTest("C=(E=E)", NODE_C.getBinding()); - new FromStringTest("D=(F=F G=G)", NODE_D.getBinding()); - return TestDataProvider.getTests(FromStringTest.class); - } - - @Test(enabled = true, dataProvider = "fromstringdata") - public void parseFromString(FromStringTest test) { - logger.warn("Testing from string: " + test.string); - DiffElement elt = DiffNode.fromString(test.string); - Assert.assertEquals(elt.toOneLineString(), test.expected.toOneLineString()); - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffObjectsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffObjectsIntegrationTest.java deleted file mode 100644 index c3108f055..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffObjectsIntegrationTest.java +++ /dev/null @@ -1,102 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.broadinstitute.gatk.utils.BaseTest; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; - -public class DiffObjectsIntegrationTest extends WalkerTest { - private class TestParams extends TestDataProvider { - public File master, test; - public String MD5; - public boolean doPairwise; - - private TestParams(String master, String test, final boolean doPairwise, String MD5) { - super(TestParams.class); - this.master = new File(master); - this.test = new File(test); - this.MD5 = MD5; - this.doPairwise = doPairwise; - } - - public String toString() { - return String.format("master=%s,test=%s,md5=%s", master, test, MD5); - } - } - - @DataProvider(name = "data") - public Object[][] createData() { - new TestParams(privateTestDir + "diffTestMaster.vcf", privateTestDir + "diffTestTest.vcf", true, "71869ddf9665773a842a9def4cc5f3c8"); - new TestParams(publicTestDir + "exampleBAM.bam", publicTestDir + "exampleBAM.simple.bam", true, "cec7c644c84ef9c96aacaed604d9ec9b"); - new TestParams(privateTestDir + "diffTestMaster.vcf", privateTestDir + "diffTestTest.vcf", false, "47546e03344103020e49d8037a7e0727"); - new TestParams(publicTestDir + "exampleBAM.bam", publicTestDir + "exampleBAM.simple.bam", false, "d27b37f7a366c8dacca5cd2590d3c6ce"); - return TestParams.getTests(TestParams.class); - } - - @Test(enabled = true, dataProvider = "data") - public void testDiffs(TestParams params) { - WalkerTestSpec spec = new WalkerTestSpec( - "-T DiffObjects -R " + publicTestDir + "exampleFASTA.fasta " - + " -m " + params.master - + " -t " + params.test - + (params.doPairwise ? " -doPairwise " : "") - + " -o %s", - Arrays.asList(params.MD5)); - executeTest("testDiffObjects:"+params, spec).getFirst(); - } -} - diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffableReaderUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffableReaderUnitTest.java deleted file mode 100644 index 26b786022..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffableReaderUnitTest.java +++ /dev/null @@ -1,173 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - - -// the imports for unit testing. - - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffElement; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffEngine; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffNode; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffableReader; -import htsjdk.variant.vcf.VCFConstants; -import htsjdk.variant.variantcontext.Allele; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.*; - -/** - * Basic unit test for DifferableReaders in reduced reads - */ -public class DiffableReaderUnitTest extends BaseTest { - DiffEngine engine; - - File vcfFile = new File(privateTestDir + "diffTestMaster.vcf"); - File bamFile = new File(publicTestDir + "exampleBAM.bam"); - - @BeforeClass(enabled = true) - public void createDiffEngine() { - engine = new DiffEngine(); - } - - @Test(enabled = true) - public void testPluggableDiffableReaders() { - logger.warn("testPluggableDiffableReaders"); - Map readers = engine.getReaders(); - Assert.assertNotNull(readers); - Assert.assertTrue(readers.size() > 0); - Assert.assertNotNull(readers.get("VCF")); - for ( Map.Entry e : engine.getReaders().entrySet() ) { - logger.warn("Found diffable reader: " + e.getKey()); - Assert.assertEquals(e.getValue().getName(), e.getKey()); - Assert.assertEquals(e.getValue(), engine.getReader(e.getKey())); - } - } - - private static void testLeaf(DiffNode rec, String field, Object expected) { - DiffElement value = rec.getElement(field); - Assert.assertNotNull(value, "Expected to see leaf named " + field + " in rec " + rec); - Assert.assertEquals(value.getValue().getValue(), expected, "Expected to see leaf named " + field + " to have value " + expected + " in rec " + rec + " but got instead " + value.getValue().getValue()); - } - - @Test(enabled = true, dependsOnMethods = "testPluggableDiffableReaders") - public void testVCF1() { - logger.warn("testVCF1"); - DiffableReader vcfReader = engine.getReader("VCF"); - Assert.assertTrue(vcfReader.canRead(vcfFile)); - Assert.assertFalse(vcfReader.canRead(bamFile)); - - DiffElement diff = vcfReader.readFromFile(vcfFile, -1); - Assert.assertNotNull(diff); - - Assert.assertEquals(diff.getName(), vcfFile.getName()); - Assert.assertSame(diff.getParent(), DiffElement.ROOT); - - DiffNode node = diff.getValueAsNode(); - Assert.assertEquals(node.getElements().size(), 11); - - // chr1 2646 rs62635284 G A 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:53,75:3:-12.40,-0.90,-0.00:9.03 - DiffNode rec1 = node.getElement("chr1:2646").getValueAsNode(); - testLeaf(rec1, "CHROM", "chr1"); - testLeaf(rec1, "POS", 2646); - testLeaf(rec1, "ID", "rs62635284"); - testLeaf(rec1, "REF", Allele.create("G", true)); - testLeaf(rec1, "ALT", Arrays.asList(Allele.create("A"))); - testLeaf(rec1, "QUAL", 0.15); - testLeaf(rec1, "FILTER", VCFConstants.PASSES_FILTERS_v4); - testLeaf(rec1, "AC", "2"); - testLeaf(rec1, "AF", "1.00"); - testLeaf(rec1, "AN", "2"); - } - - @Test(enabled = true, dependsOnMethods = "testPluggableDiffableReaders") - public void testBAM() { - logger.warn("testBAM"); - DiffableReader bamReader = engine.getReader("BAM"); - Assert.assertTrue(bamReader.canRead(bamFile)); - Assert.assertFalse(bamReader.canRead(vcfFile)); - - DiffElement diff = bamReader.readFromFile(bamFile, -1); - Assert.assertNotNull(diff); - - Assert.assertEquals(diff.getName(), bamFile.getName()); - Assert.assertSame(diff.getParent(), DiffElement.ROOT); - - DiffNode node = diff.getValueAsNode(); - Assert.assertEquals(node.getElements().size(), 33); - - // 30PPJAAXX090125:1:42:512:1817#0 99 chr1 200 0 76M = - // 255 -130 ACCCTAACCCTAACCCTAACCCTAACCATAACCCTAAGACTAACCCTAAACCTAACCCTCATAATCGAAATACAAC - // BBBBC@C?AABCBB<63>=B@>+B9-9+)2B8,+@327B5A>90((>-+''3?(/'''A)(''19('7.,**%)3: - // PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam - - DiffNode rec1 = node.getElement("30PPJAAXX090125:1:42:512:1817#0_1").getValueAsNode(); - testLeaf(rec1, "NAME", "30PPJAAXX090125:1:42:512:1817#0"); - testLeaf(rec1, "FLAGS", 99); - testLeaf(rec1, "RNAME", "chr1"); - testLeaf(rec1, "POS", 200); - testLeaf(rec1, "MAPQ", 0); - testLeaf(rec1, "CIGAR", "76M"); - testLeaf(rec1, "RNEXT", "chr1"); - testLeaf(rec1, "PNEXT", 255); - testLeaf(rec1, "TLEN", -130); - testLeaf(rec1, "SEQ", "ACCCTAACCCTAACCCTAACCCTAACCATAACCCTAAGACTAACCCTAAACCTAACCCTCATAATCGAAATACAAC"); - testLeaf(rec1, "QUAL", "BBBBC@C?AABCBB<63>=B@>+B9-9+)2B8,+@327B5A>90((>-+''3?(/'''A)(''19('7.,**%)3:"); - testLeaf(rec1, "PG", "0"); - testLeaf(rec1, "RG", "exampleBAM.bam"); - testLeaf(rec1, "SM", "exampleBAM.bam"); - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DifferenceUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DifferenceUnitTest.java deleted file mode 100644 index 685514f34..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DifferenceUnitTest.java +++ /dev/null @@ -1,118 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - - -// the imports for unit testing. - - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffElement; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffNode; -import org.broadinstitute.gatk.engine.walkers.diffengine.Difference; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -/** - * Basic unit test for DifferableReaders in reduced reads - */ -public class DifferenceUnitTest extends BaseTest { - // -------------------------------------------------------------------------------- - // - // testing routines - // - // -------------------------------------------------------------------------------- - - private class DifferenceTest extends TestDataProvider { - public DiffElement tree1, tree2; - public String difference; - - private DifferenceTest(String tree1, String tree2, String difference) { - this(DiffNode.fromString(tree1), DiffNode.fromString(tree2), difference); - } - - private DifferenceTest(DiffElement tree1, DiffElement tree2, String difference) { - super(DifferenceTest.class); - this.tree1 = tree1; - this.tree2 = tree2; - this.difference = difference; - } - - public String toString() { - return String.format("tree1=%s tree2=%s diff=%s", - tree1 == null ? "null" : tree1.toOneLineString(), - tree2 == null ? "null" : tree2.toOneLineString(), - difference); - } - } - - @DataProvider(name = "data") - public Object[][] createTrees() { - new DifferenceTest("A=X", "A=Y", "A:1:X!=Y"); - new DifferenceTest("A=Y", "A=X", "A:1:Y!=X"); - new DifferenceTest(DiffNode.fromString("A=X"), null, "A:1:X!=MISSING"); - new DifferenceTest(null, DiffNode.fromString("A=X"), "A:1:MISSING!=X"); - return DifferenceTest.getTests(DifferenceTest.class); - } - - @Test(enabled = true, dataProvider = "data") - public void testDiffToString(DifferenceTest test) { - logger.warn("Test tree1: " + (test.tree1 == null ? "null" : test.tree1.toOneLineString())); - logger.warn("Test tree2: " + (test.tree2 == null ? "null" : test.tree2.toOneLineString())); - logger.warn("Test expected diff : " + test.difference); - Difference diff = new Difference(test.tree1, test.tree2); - logger.warn("Observed diffs : " + diff); - Assert.assertEquals(diff.toString(), test.difference, "Observed diff string " + diff + " not equal to expected difference string " + test.difference ); - - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummariesUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummariesUnitTest.java index 6adc55352..dd18b9d08 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummariesUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummariesUnitTest.java @@ -52,6 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import htsjdk.variant.variantcontext.*; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.testng.Assert; import org.testng.annotations.Test; @@ -82,8 +83,8 @@ public class GenotypeSummariesUnitTest { final GenotypeSummaries GS = new GenotypeSummaries(); final Map resultMap = GS.annotate(null, null, null, null, testVC, null); - Assert.assertEquals(1, resultMap.get(GenotypeSummaries.NCC)); // 1 no-called called sample - Assert.assertEquals(30.0, Double.parseDouble((String)resultMap.get(GenotypeSummaries.GQ_MEAN)), 1E-4); // mean GQ is 30 - Assert.assertFalse(resultMap.containsKey(GenotypeSummaries.GQ_STDDEV)); // no stddev with only one data point + Assert.assertEquals(1, resultMap.get(GATKVCFConstants.NOCALL_CHROM_KEY)); // 1 no-called called sample + Assert.assertEquals(30.0, Double.parseDouble((String)resultMap.get(GATKVCFConstants.GQ_MEAN_KEY)), 1E-4); // mean GQ is 30 + Assert.assertFalse(resultMap.containsKey(GATKVCFConstants.GQ_STDEV_KEY)); // no stddev with only one data point } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumUnitTest.java index cc6207ac8..5242414d7 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumUnitTest.java @@ -51,8 +51,8 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.utils.MannWhitneyU; +import org.broadinstitute.gatk.utils.Utils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -81,9 +81,9 @@ public class RankSumUnitTest { makeDistribution(distribution20_40, 40, skew, observations/2); // shuffle the observations - Collections.shuffle(distribution20, GenomeAnalysisEngine.getRandomGenerator()); - Collections.shuffle(distribution30, GenomeAnalysisEngine.getRandomGenerator()); - Collections.shuffle(distribution20_40, GenomeAnalysisEngine.getRandomGenerator()); + Collections.shuffle(distribution20, Utils.getRandomGenerator()); + Collections.shuffle(distribution30, Utils.getRandomGenerator()); + Collections.shuffle(distribution20_40, Utils.getRandomGenerator()); } private static void makeDistribution(final List result, final int target, final int skew, final int numObservations) { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java index 10946a183..333175938 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -53,13 +53,18 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import htsjdk.tribble.readers.LineIterator; import htsjdk.tribble.readers.PositionalBufferedStream; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.broadinstitute.gatk.utils.exceptions.UserException; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFCodec; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.testng.Assert; import org.testng.annotations.Test; +import org.apache.commons.io.FileUtils; + import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -69,6 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { final static String REF = b37KGReference; final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + final static String standardAnnotations = " -G Standard -G StandardUG "; public static String baseTestString() { return "-T VariantAnnotator -R " + b36KGReference + " --no_cmdline_in_header -o %s"; @@ -93,16 +99,16 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("62b6dacf131695f81eccbfe2b1efa128")); + baseTestString() + standardAnnotations + "--variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + Arrays.asList("92eb47332dd9d7ee7fbe3120dc39c594")); executeTest("test file has annotations, asking for annotations, #1", spec); } @Test public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("8cd16a59e4697beb1c6d75d0b82c8cf5")); + baseTestString() + standardAnnotations + "--variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, + Arrays.asList("c367bf7cebd7b26305f8d4736788aec8")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -127,39 +133,64 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("6a93ce9ce7bda8de285c9c67d93a0b10")); + baseTestString() + standardAnnotations + "--variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + Arrays.asList("098dcad8d90d90391755a0191c9db59c")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @Test public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("1554af900d1caee1d85824ee85e54398")); + baseTestString() + standardAnnotations + "--variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, + Arrays.asList("f3bbfbc179d2e1bae49890f1e9dfde34")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @Test public void testExcludeAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("9c1aa104de4735be4f7e418014b6536b")); + baseTestString() + standardAnnotations + "-XA FisherStrand -XA ReadPosRankSumTest --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + Arrays.asList("7267450fc4d002f75a24ca17278e0950")); executeTest("test exclude annotations", spec); } + @Test + public void testAskingStrandAlleleCountsBySample() throws IOException{ + String logFileName = new String("testAskingStrandAlleleCountsBySample.log"); + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000 -A StrandAlleleCountsBySample -log " + logFileName, 1, + Arrays.asList("0c0c4a219cb487598fb1fbb77db71eca")); + executeTest("test file has annotations, adding StrandAlleleCountsBySample annotation", spec); + + File file = new File(logFileName); + Assert.assertTrue(FileUtils.readFileToString(file).contains("Annotation will not be calculated, must be called from HaplotyepCaller")); + } + + @Test + public void testAskingGCContent() throws IOException{ + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000 -A GCContent", 1, + Arrays.asList("02f634fd978cf2a66738704581508569")); + final File outputVCF = executeTest("test file has annotations, adding GCContent annotation", spec).getFirst().get(0); + final VCFCodec codec = new VCFCodec(); + final VCFHeader header = (VCFHeader) codec.readActualHeader(codec.makeSourceFromStream(new FileInputStream(outputVCF))); + final VCFHeaderLine infoLineGC = header.getInfoHeaderLine(GATKVCFConstants.GC_CONTENT_KEY); + // GC content must be a Float type + Assert.assertTrue(infoLineGC.toString().contains("Type=Float")); + } + @Test public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("06b4127795a67bd26156cc1651f3a98b")); + baseTestString() + standardAnnotations + "--variant " + privateTestDir + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, + Arrays.asList("18592c72d83ee84e1326acb999518c38")); executeTest("test overwriting header", spec); } @Test public void testNoReads() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, + baseTestString() + standardAnnotations + "--variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, Arrays.asList("6de950b381d2d92b21bab6144e8f0714")); executeTest("not passing it any reads", spec); } @@ -167,7 +198,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testDBTagWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, + baseTestString() + " --dbsnp " + b36dbSNP129 + standardAnnotations + "--variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, Arrays.asList("e0bd85747c87ea4df6ef67f593cbacbf")); executeTest("getting DB tag with dbSNP", spec); } @@ -175,7 +206,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testMultipleIdsWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --alwaysAppendDbsnpId --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3withIDs.vcf -L " + privateTestDir + "vcfexample3withIDs.vcf", 1, + baseTestString() + " --alwaysAppendDbsnpId --dbsnp " + b36dbSNP129 + standardAnnotations + "--variant " + privateTestDir + "vcfexample3withIDs.vcf -L " + privateTestDir + "vcfexample3withIDs.vcf", 1, Arrays.asList("194a942f17104292192fb564a3c96610")); executeTest("adding multiple IDs with dbSNP", spec); } @@ -183,7 +214,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testDBTagWithHapMap() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, + baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf" + standardAnnotations + "--variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, Arrays.asList("9e41ae733a76632b40eda38e3cef909d")); executeTest("getting DB tag with HM3", spec); } @@ -191,7 +222,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testDBTagWithTwoComps() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf --comp:foo " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, + baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf --comp:foo " + privateTestDir + "fakeHM3.vcf " + standardAnnotations + " --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, Arrays.asList("7b718bae0444f1896a6e86da80531218")); executeTest("getting DB tag with 2 comps", spec); } @@ -207,15 +238,23 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testUsingExpression() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.AF -L " + privateTestDir + "vcfexample3empty.vcf", 1, + baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf" + standardAnnotations + "--variant " + privateTestDir + "vcfexample3empty.vcf -E foo.AF -L " + privateTestDir + "vcfexample3empty.vcf", 1, Arrays.asList("0bed7b4f6ed0556c5e7d398353a9fa91")); executeTest("using expression", spec); } + @Test + public void testUsingExpressionMultiAllele() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations-multiAllele.vcf" + standardAnnotations + "--variant " + privateTestDir + "vcfexample3empty-multiAllele.vcf -E foo.AF -E foo.AC -L " + privateTestDir + "vcfexample3empty-multiAllele.vcf", 1, + Arrays.asList("195cf0f5b1aa5c7d00a0595dcca02f4c")); + executeTest("using expression with multi-alleles", spec); + } + @Test public void testUsingExpressionWithID() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.ID -L " + privateTestDir + "vcfexample3empty.vcf", 1, + baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf" + standardAnnotations + "--variant " + privateTestDir + "vcfexample3empty.vcf -E foo.ID -L " + privateTestDir + "vcfexample3empty.vcf", 1, Arrays.asList("b3fe9d3bdb18ca2629543f849a7d27ed")); executeTest("using expression with ID", spec); } @@ -270,7 +309,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { "--snpEffFile " + privateTestDir + "snpEff_unsupported_version_no_gatk_mode.vcf " + "-L 1:10001292-10012424", 1, - UserException.class + Arrays.asList("87cbf53c65ef4498b721f901f87f0161") ); executeTest("Testing SnpEff annotations (unsupported version, no GATK mode)", spec); } @@ -306,12 +345,26 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { executeTest("Testing InbreedingCoeff annotation with PED file", spec); } + @Test(enabled = true) + public void testAlleleTrimming() { + final String MD5 = "5f4b8dcbd4ec3b773486945e5b38e7f3"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantAnnotator -R " + b37KGReference + " -A InbreedingCoeff --variant:vcf " + privateTestDir + "alleleTrim.vcf.gz" + + " -L 1:26608870-26608875 -no_cmdline_in_header --resource:exac " + privateTestDir + "exacAlleleTrim.vcf.gz -E exac.AC_Adj" + + " -o %s", 1, + Arrays.asList(MD5)); + executeTest("Testing allele trimming annotation", spec); + } + @Test public void testStrandBiasBySample() throws IOException { + // pipeline 1: create variant via HalotypeCaller with no default annotations final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800"; final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); final File outputVCF = executeTest("testStrandBiasBySample", spec).getFirst().get(0); + // pipeline 2: create variant via HalotypeCaller; include StrandBiasBySample, exclude FisherStrand annotation + // re-Annotate the variant with VariantAnnotator using FisherStrand annotation final String baseNoFS = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -XA FisherStrand -A StrandBiasBySample"; final WalkerTestSpec specNoFS = new WalkerTestSpec(baseNoFS, 1, Arrays.asList("")); specNoFS.disableShadowBCF(); @@ -351,6 +404,39 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { Assert.assertFalse(lineIteratorAnn.hasNext()); } + @Test + public void testStrandAlleleCountsBySample() throws IOException { + final WalkerTestSpec spec = new WalkerTestSpec( + "-T HaplotypeCaller --disableDithering " + + String.format("-R %s -I %s ", REF, CEUTRIO_BAM) + + "--no_cmdline_in_header -o %s -L 20:10130000-10134800 " + + "-A StrandBiasBySample -A StrandAlleleCountsBySample", + 1, Arrays.asList("") + ); + spec.disableShadowBCF(); //TODO: Remove when BaseTest.assertAttributesEquals() works with SC + final File outputVCF = executeTest("testStrandAlleleCountsBySample", spec).getFirst().get(0); + + //Confirm that SB and SAC are identical for bi-allelic variants + final VCFCodec codec = new VCFCodec(); + final FileInputStream s = new FileInputStream(outputVCF); + final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); + codec.readHeader(lineIterator); + + while (lineIterator.hasNext()) { + final String line = lineIterator.next(); + Assert.assertFalse(line == null); + final VariantContext vc = codec.decode(line); + + if (vc.isBiallelic()) { + for (final Genotype g : vc.getGenotypes()) { + Assert.assertTrue(g.hasExtendedAttribute("SB")); + Assert.assertTrue(g.hasExtendedAttribute("SAC")); + Assert.assertEquals(g.getExtendedAttribute("SB").toString(), g.getExtendedAttribute("SAC").toString()); + } + } + } + } + @Test(enabled = false) public void testQualByDepth() throws IOException { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/beagle/BeagleIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/beagle/BeagleIntegrationTest.java index 7f84b792d..30ae63869 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/beagle/BeagleIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/beagle/BeagleIntegrationTest.java @@ -67,7 +67,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " + "--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " + "--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " + - "-o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("989449fa3e262b88ba126867fa3ad9fb")); + "-o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("1c4f2fed1d452368fa4dfe3e209ebb57")); spec.disableShadowBCF(); executeTest("test BeagleOutputToVCF", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGathererUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGathererUnitTest.java index b1d7de93a..ca10b9a1a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGathererUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGathererUnitTest.java @@ -51,10 +51,11 @@ package org.broadinstitute.gatk.tools.walkers.bqsr; +import org.broadinstitute.gatk.engine.recalibration.BQSRGatherer; import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportTable; -import org.broadinstitute.gatk.utils.recalibration.RecalUtils; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportTable; +import org.broadinstitute.gatk.engine.recalibration.RecalUtils; import org.testng.Assert; import org.testng.annotations.Test; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRIntegrationTest.java index 78174c096..e3bfde076 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRIntegrationTest.java @@ -101,17 +101,20 @@ public class BQSRIntegrationTest extends WalkerTest { private static final String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam"; private static final String HiSeqInterval = "chr1:10,000,000-10,100,000"; + private static final String SimpleCigarMatchMismatchBam = privateTestDir + "simpleCigarMatchMismatch.bam"; + private static final String SimpleCigarMatchMismatchInterval = "1:1-60"; @DataProvider(name = "BQSRTest") public Object[][] createBQSRTestData() { return new Object[][]{ - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "f805a0020eea987b79f314fa99913806")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "86075d3856eb06816a0dd81af55e421f")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "155802237e1fc7a001398b8f4bcf4b72")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "38c7916cc019fe8d134df67639422b42")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "b74e75f3c5aa90bd21af1e20f2ac8c40")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "e564505aea11464de8ed72890d9ea89a")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "380d8be121ffaddd3461ee0ac3d1a76f")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "fc9df1faf67bab70d32f89bcf4fa39db")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "73ec38eb23b1739ecef8194cbb1132a3")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "2d5721193ed4410d1a7d8db467a1fa05")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "16df7f1745f17f190c9fc33c475b91d8")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "01811003ae811ee74c4b8d3eb5e992fe")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "5e0eea6b0b300fbd2edabc3506ad3a60")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "8500b9747c16cb8eb17082163bdb8069")}, + {new BQSRTest(b36KGReference, SimpleCigarMatchMismatchBam, SimpleCigarMatchMismatchInterval, "", "56dfb2918a4cdae3ef9d705a43e85194")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "0b5a8e259e997e4c7b5836d4c28e6f4d")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "281682124584ab384f23359934df0c3b")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "0a92fdff5fd26227c29d34eda5a32f49")}, @@ -193,12 +196,12 @@ public class BQSRIntegrationTest extends WalkerTest { public Object[][] createPRTestData() { List tests = new ArrayList(); - tests.add(new Object[]{1, new PRTest(" -qq -1", "fcc136b877fbde38791533b0f1ae39e4")}); - tests.add(new Object[]{1, new PRTest(" -qq 6", "f21b537c1689b8051b878ea5cc9b61a0")}); - tests.add(new Object[]{1, new PRTest(" -DIQ", "1d04a242bf825177d6a45eff9fbed647")}); + tests.add(new Object[]{1, new PRTest(" -qq -1", "ce09e16466151bb37305dbfd5dc88f35")}); + tests.add(new Object[]{1, new PRTest(" -qq 6", "2d12f3d48b1797ea0671e28a435527fe")}); + tests.add(new Object[]{1, new PRTest(" -DIQ", "f3dbf3ae2725f1e7aa8ae61a09beac51")}); for ( final int nct : Arrays.asList(1, 2, 4) ) { - tests.add(new Object[]{nct, new PRTest("", "b6f343ac69c63cdb49205c13e67297fc")}); + tests.add(new Object[]{nct, new PRTest("", "0746ae12c106a8af0b3b01f22e9efcba")}); } return tests.toArray(new Object[][]{}); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/ReadRecalibrationInfoUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/ReadRecalibrationInfoUnitTest.java index acb06c4ea..3b4243831 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/ReadRecalibrationInfoUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/ReadRecalibrationInfoUnitTest.java @@ -54,7 +54,7 @@ package org.broadinstitute.gatk.tools.walkers.bqsr; import htsjdk.samtools.SAMUtils; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.recalibration.EventType; -import org.broadinstitute.gatk.utils.recalibration.ReadCovariates; +import org.broadinstitute.gatk.engine.recalibration.ReadCovariates; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.testng.Assert; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java index 9c3168401..8b5e42109 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java @@ -71,11 +71,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testSingleSample() { - DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "6ca3d3917a7b65eaa877aa3658d80912"); + DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "5cad1b8e3bf5582842bbeadbc173e8aa"); } @Test(enabled = true) public void testMultiSample() { - DTTest("testMultiSample ", "-I " + multiSample, "f50c6b9bef9f63f0a8b32ae9a9bdd51a"); + DTTest("testMultiSample ", "-I " + multiSample, "c2a11ad34104fd5e4e65bdf049abe5e7"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java index c538d6b81..0816b00f2 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltrationIntegrationTest.java @@ -144,6 +144,14 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { executeTest("test filter with separate names #2", spec); } + @Test + public void testInvertFilter() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' --variant " + privateTestDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000 --invert_filter_expression", 1, + Arrays.asList("d478fd6bcf0884133fe2a47adf4cd765")); + executeTest("test inversion of selection of filter with separate names #2", spec); + } + @Test public void testGenotypeFilters1() { WalkerTestSpec spec1 = new WalkerTestSpec( @@ -194,4 +202,13 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { Arrays.asList("e10485c7c33d9211d0c1294fd7858476")); executeTest("testFilteringDPfromFORMAT", spec); } + + @Test + public void testInvertGenotypeFilterExpression() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + + " --genotypeFilterExpression 'DP < 8' --genotypeFilterName highDP -V " + privateTestDir + "filteringDepthInFormat.vcf --invert_genotype_filter_expression", 1, + Arrays.asList("d2664870e7145eb73a2295766482c823")); + executeTest("testInvertGenotypeFilterExpression", spec); + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUnitTester.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUnitTester.java index 015460696..585bb2f8d 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUnitTester.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUnitTester.java @@ -52,8 +52,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.utils.RandomDNA; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList; import org.testng.Assert; import org.testng.SkipException; @@ -69,15 +71,15 @@ import java.util.Set; */ public class AlleleListUnitTester { - private static final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + private static final Random rnd = Utils.getRandomGenerator(); private static final RandomDNA rndDNA = new RandomDNA(rnd); /** * Test that the contents of an allele-list are the ones expected. *

    *

    - * This method perform various consistency check involving all the {@link org.broadinstitute.gatk.tools.walkers.genotyper.AlleleList} interface methods. - * Therefore calling this method is equivalent to a thorough check of the {@link org.broadinstitute.gatk.tools.walkers.genotyper.AlleleList} aspect of + * This method perform various consistency check involving all the {@link org.broadinstitute.gatk.utils.genotyper.AlleleList} interface methods. + * Therefore calling this method is equivalent to a thorough check of the {@link org.broadinstitute.gatk.utils.genotyper.AlleleList} aspect of * the {@code actual} argument. *

    * diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUtilsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUtilsUnitTest.java index ddec1a643..7c3a85d53 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUtilsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUtilsUnitTest.java @@ -52,7 +52,11 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.AlleleListPermutation; +import org.broadinstitute.gatk.utils.genotyper.AlleleListUtils; +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList; import org.testng.Assert; import org.testng.SkipException; import org.testng.annotations.BeforeClass; @@ -62,7 +66,7 @@ import org.testng.annotations.Test; import java.util.*; /** - * Test {@link org.broadinstitute.gatk.tools.walkers.genotyper.AlleleListUtils}. + * Test {@link org.broadinstitute.gatk.utils.genotyper.AlleleListUtils}. * * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ @@ -121,7 +125,7 @@ public class AlleleListUtilsUnitTest { Assert.assertTrue(AlleleListUtils.equals(selfPermutation,originalAlleleList)); } - private final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + private final Random rnd = Utils.getRandomGenerator(); @Test(dataProvider = "singleAlleleListData", dependsOnMethods = "testEquals") public void testSubsetPermutation(final List alleles1) { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ArtificialReadPileupTestProvider.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ArtificialReadPileupTestProvider.java index 3dab2af2d..d65770d7b 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ArtificialReadPileupTestProvider.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ArtificialReadPileupTestProvider.java @@ -53,14 +53,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMReadGroupRecord; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.*; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; -import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; @@ -247,10 +243,10 @@ public class ArtificialReadPileupTestProvider { double errorProbability = QualityUtils.qualToErrorProb((byte)phredScaledErrorRate); for (int k=0; k < readBases.length; k++) { - if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < errorProbability) { + if (Utils.getRandomGenerator().nextDouble() < errorProbability) { // random offset int offset = BaseUtils.simpleBaseToBaseIndex(readBases[k]); //0..3 - offset += (GenomeAnalysisEngine.getRandomGenerator().nextInt(3)+1); // adds 1,2 or 3 + offset += (Utils.getRandomGenerator().nextInt(3)+1); // adds 1,2 or 3 offset %= 4; readBases[k] = BaseUtils.baseIndexToSimpleBase(offset); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java index 1dd02c80c..830f3681d 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java @@ -53,7 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.samtools.SAMUtils; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.engine.walkers.Walker; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.MathUtils; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculatorUnitTest.java index a8169dc4f..d3a0864da 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculatorUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculatorUnitTest.java @@ -53,8 +53,8 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.GenotypeLikelihoods; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.testng.Assert; import org.testng.annotations.DataProvider; @@ -129,7 +129,7 @@ public class GenotypeLikelihoodCalculatorUnitTest { @Test(dataProvider = "ploidyAndMaximumAlleleAndNewMaximumAlleleData") public void testGenotypeIndexMap(final int ploidy, final int oldAlleleCount, final int newAlleleCount) { - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); final int maxAlleleCount = Math.max(oldAlleleCount,newAlleleCount); final int[] alleleMap = new int[newAlleleCount]; final Map> reverseMap = new HashMap<>(oldAlleleCount); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingDataUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingDataUnitTest.java index 1ec10855f..a96302d0a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingDataUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingDataUnitTest.java @@ -52,7 +52,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.genotyper.AlleleListUtils; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/HeterogeneousPloidyModel.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/HeterogeneousPloidyModel.java index 3e19d2734..e918602da 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/HeterogeneousPloidyModel.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/HeterogeneousPloidyModel.java @@ -52,8 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; -import org.broadinstitute.gatk.tools.walkers.genotyper.PloidyModel; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; /** * General heterogeneous ploidy model. diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/HomogeneousPloidyModelUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/HomogeneousPloidyModelUnitTest.java index 89038bdf6..1b5d97994 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/HomogeneousPloidyModelUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/HomogeneousPloidyModelUnitTest.java @@ -51,6 +51,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java index 920e10b09..e19cfca29 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java @@ -53,7 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; import org.broadinstitute.gatk.engine.walkers.Walker; import htsjdk.variant.variantcontext.Allele; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedAlleleListUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedAlleleListUnitTest.java index 2f8663468..151a2325f 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedAlleleListUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedAlleleListUnitTest.java @@ -53,6 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -61,7 +62,7 @@ import java.util.*; import static org.broadinstitute.gatk.tools.walkers.genotyper.AlleleListUnitTester.assertAlleleList; /** - * Tests {@link org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList}. + * Tests {@link org.broadinstitute.gatk.utils.genotyper.IndexedSampleList}. * * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedSampleListUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedSampleListUnitTest.java index e79512b1e..098c39c66 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedSampleListUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedSampleListUnitTest.java @@ -52,14 +52,15 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; /** - * Tests {@link org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList}. + * Tests {@link org.broadinstitute.gatk.utils.genotyper.IndexedSampleList}. * * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ @@ -119,7 +120,7 @@ public class IndexedSampleListUnitTest { private static final int[] MAX_SAMPLE_INDEX = { 0, 1, 4, 9, 10000}; - private static final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + private static final Random rnd = Utils.getRandomGenerator(); @DataProvider(name="sampleCountMaxSampleIndexData") diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/InfiniteRandomMatingPopulationModelUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/InfiniteRandomMatingPopulationModelUnitTest.java index 0e10f741a..70fdf5245 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/InfiniteRandomMatingPopulationModelUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/InfiniteRandomMatingPopulationModelUnitTest.java @@ -53,8 +53,8 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.GenotypeLikelihoods; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.genotyper.*; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -93,7 +93,7 @@ public class InfiniteRandomMatingPopulationModelUnitTest { } private AlleleList discardAllelesAtRandom(final AlleleList likelihoods, final int discardAlleleCount) { - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); final ArrayList subset = new ArrayList<>(AlleleListUtils.asList(likelihoods)); for (int i = 0; i < discardAlleleCount; i++) { subset.remove(rnd.nextInt(subset.size())); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/NanoSchedulerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/NanoSchedulerIntegrationTest.java new file mode 100644 index 000000000..bae62b6c5 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/NanoSchedulerIntegrationTest.java @@ -0,0 +1,101 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.genotyper; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +// ********************************************************************************** // +// Note that this class also serves as an integration test for the VariantAnnotator! // +// ********************************************************************************** // + +public class NanoSchedulerIntegrationTest extends WalkerTest { + @DataProvider(name = "NanoSchedulerUGTest") + public Object[][] createNanoSchedulerUGTest() { + List tests = new ArrayList(); + + for ( final int nt : Arrays.asList(1, 2) ) + for ( final int nct : Arrays.asList(1, 2) ) { + tests.add(new Object[]{ "BOTH", "18418ddc2bdbe20c38ece6dd18535be7", nt, nct }); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "NanoSchedulerUGTest") + private void testNanoSchedulerUGTest(final String glm, final String md5, final int nt, final int nct ) { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T UnifiedGenotyper -R " + b37KGReference, + "--no_cmdline_in_header -G none", + "-I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", + "-L 20:10,000,000-10,100,000", + "-glm " + glm, + "--contamination_fraction_to_filter 0.0", + "-nt " + nt, + "-nct " + nct, + "-o %s" + ), + 1, + Arrays.asList(md5) + ); + executeTest(String.format("testUG-glm:%s-nt%d-nct%d", glm, nt, nct), spec); + } + + + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTest.java new file mode 100644 index 000000000..32ad71f4b --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTest.java @@ -0,0 +1,857 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.genotyper; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.genotyper.*; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.testng.Assert; +import org.testng.SkipException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Test code for {@link ReadLikelihoods} + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class ReadLikelihoodsUnitTest +{ + private static final double EPSILON = 1e-6; + private static final int ODD_READ_START = 101; + private static final int EVEN_READ_START = 1; + + @Test(dataProvider = "dataSets") + public void testInstantiationAndQuery(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods result = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + + Assert.assertEquals(result.sampleCount(), samples.length); + Assert.assertEquals(result.alleleCount(), alleles.length); + + + testSampleQueries(samples, reads, result); + testAlleleQueries(alleles, result); + testLikelihoodMatrixQueries(samples, result, null); + } + + @Test(dataProvider = "dataSets") + public void testLikelihoodFillingAndQuery(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods result = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + final double[][][] likelihoods = fillWithRandomLikelihoods(samples, alleles, result); + testLikelihoodMatrixQueries(samples, result, likelihoods); + } + + private double[][][] fillWithRandomLikelihoods(final String[] samples, final Allele[] alleles, final ReadLikelihoods result) { + final Random rnd = Utils.getRandomGenerator(); + final double[][][] likelihoods = new double[samples.length][alleles.length][]; + for (int s = 0; s < likelihoods.length; s++) { + final ReadLikelihoods.Matrix sampleLikelihoods = result.sampleMatrix(s); + for (int a = 0; a < likelihoods[s].length; a++) { + likelihoods[s][a] = new double[result.sampleReadCount(s)]; + for (int r = 0; r < likelihoods[s][a].length; r++) + sampleLikelihoods.set(a,r,likelihoods[s][a][r] = -Math.abs(rnd.nextGaussian())); + } + } + return likelihoods; + } + + @Test(dataProvider = "dataSets") + public void testBestAlleles(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + fillWithRandomLikelihoods(samples,alleles,original); + final int alleleCount = alleles.length; + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + final ReadLikelihoods.Matrix sampleMatrix = original.sampleMatrix(s); + final double[] bestLkArray = new double[sampleReadCount]; + final int[] bestIndexArray = new int[sampleReadCount]; + final double[] confidenceArray = new double[sampleReadCount]; + for (int r = 0; r < sampleReadCount; r++) { + int bestAlleleIndex = -1; + double bestAlleleLk = Double.NEGATIVE_INFINITY; + double secondBestAlleleLk = Double.NEGATIVE_INFINITY; + for (int a = 0; a < alleleCount; a++) { + final double lk = sampleMatrix.get(a,r); + if (lk > bestAlleleLk) { + secondBestAlleleLk = bestAlleleLk; + bestAlleleLk = lk; + bestAlleleIndex = a; + } else if (lk > secondBestAlleleLk) { + secondBestAlleleLk = lk; + } + } + bestLkArray[r] = bestAlleleLk; + confidenceArray[r] = bestAlleleLk - secondBestAlleleLk; + bestIndexArray[r] = bestAlleleIndex; + } + final Collection.BestAllele> bestAlleles = original.bestAlleles(); + for (final ReadLikelihoods.BestAllele bestAllele : bestAlleles) { + final int readIndex = original.readIndex(s,bestAllele.read); + if (readIndex == -1) continue; + Assert.assertEquals(bestLkArray[readIndex],bestAllele.likelihood); + Assert.assertEquals(bestAllele.allele,alleles[bestIndexArray[readIndex]]); + Assert.assertEquals(bestAllele.confidence,confidenceArray[readIndex],EPSILON); + } + } + } + + @Test(dataProvider = "dataSets") + public void testBestAlleleMap(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + fillWithRandomLikelihoods(samples,alleles,original); + final Map> expected = new HashMap<>(alleles.length); + for (final Allele allele : alleles) + expected.put(allele,new ArrayList()); + + final int alleleCount = alleles.length; + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + final ReadLikelihoods.Matrix sampleMatrix = original.sampleMatrix(s); + for (int r = 0; r < sampleReadCount; r++) { + int bestAlleleIndex = -1; + double bestAlleleLk = Double.NEGATIVE_INFINITY; + double secondBestAlleleLk = Double.NEGATIVE_INFINITY; + for (int a = 0; a < alleleCount; a++) { + final double lk = sampleMatrix.get(a,r); + if (lk > bestAlleleLk) { + secondBestAlleleLk = bestAlleleLk; + bestAlleleLk = lk; + bestAlleleIndex = a; + } else if (lk > secondBestAlleleLk) { + secondBestAlleleLk = lk; + } + } + if ((bestAlleleLk - secondBestAlleleLk) > ReadLikelihoods.BestAllele.INFORMATIVE_THRESHOLD) + expected.get(alleles[bestAlleleIndex]).add(sampleMatrix.readAt(r)); + } + } + + final Map> actual = original.readsByBestAlleleMap(); + + Assert.assertEquals(actual.size(),alleles.length); + for (final Allele allele : alleles) { + final List expectedList = expected.get(allele); + final List actualList = actual.get(allele); + final Set expectedSet = new HashSet<>(expectedList); + final Set actualSet = new HashSet<>(actualList); + Assert.assertEquals(actualSet,expectedSet); + } + } + + @Test(dataProvider = "dataSets") + public void testFilterPoorlyModeledReads(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + for (int r = 0; r < sampleReadCount; r++) { + if ((r & 1) == 0) continue; + for (int a = 0; a < alleles.length; a++) + original.sampleMatrix(s).set(a,r,-10000); + } + } + + final ReadLikelihoods result = original.clone(); + result.filterPoorlyModeledReads(2.0); + + for (int s = 0; s < samples.length; s++) { + final int oldSampleReadCount = original.sampleReadCount(s); + final int newSampleReadCount = result.sampleReadCount(s); + Assert.assertEquals(newSampleReadCount,(oldSampleReadCount + 1) / 2); + final ReadLikelihoods.Matrix newSampleMatrix = result.sampleMatrix(s); + final ReadLikelihoods.Matrix oldSampleMatrix = original.sampleMatrix(s); + for (int r = 0 ; r < newSampleReadCount; r++) { + Assert.assertEquals(original.readIndex(s, result.sampleReads(s).get(r)), r * 2); + for (int a = 0; a < alleles.length; a++) { + Assert.assertEquals(newSampleMatrix.get(a,r),oldSampleMatrix.get(a,r*2)); + } + } + } + } + + @Test(dataProvider = "dataSets") + public void testFilterReadsToOverlap(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + final GenomeLoc evenReadOverlap = locParser.createGenomeLoc(SAM_HEADER.getSequenceDictionary().getSequences().get(0).getSequenceName(),EVEN_READ_START ,EVEN_READ_START ); + fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result = original.clone(); + result.filterToOnlyOverlappingUnclippedReads(evenReadOverlap); + final double[][][] newLikelihoods = new double[samples.length][alleles.length][]; + for (int s = 0; s < samples.length ; s++) + for (int a = 0; a < alleles.length; a++) { + newLikelihoods[s][a] = new double[(original.sampleReadCount(s) + 1) / 2]; + final ReadLikelihoods.Matrix sampleMatrix = original.sampleMatrix(s); + for (int r = 0; r < newLikelihoods[s][a].length; r++) { + Assert.assertEquals(result.readIndex(s,sampleMatrix.readAt(r << 1)),r); + newLikelihoods[s][a][r] = sampleMatrix.get(a, r << 1); + } + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + @Test(dataProvider = "marginalizationDataSets") + public void testMarginalizationWithOverlap(final String[] samples, final Allele[] alleles, final Map> reads, final Map> newToOldAlleleMapping) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + final GenomeLoc evenReadOverlap = locParser.createGenomeLoc(SAM_HEADER.getSequenceDictionary().getSequences().get(0).getSequenceName(),EVEN_READ_START ,EVEN_READ_START ); + fillWithRandomLikelihoods(samples, alleles, original); + final ReadLikelihoods marginalized = original.marginalize(newToOldAlleleMapping,evenReadOverlap); + Assert.assertNotNull(marginalized); + Assert.assertEquals(newToOldAlleleMapping.size(),marginalized.alleleCount()); + for (int a = 0; a < marginalized.alleleCount(); a++) { + final List oldAlleles = newToOldAlleleMapping.get(marginalized.alleleAt(a)); + Assert.assertNotNull(oldAlleles); + for (int s = 0; s < samples.length; s++) { + final ReadLikelihoods.Matrix oldSmapleLikelihoods = original.sampleMatrix(s); + final ReadLikelihoods.Matrix sampleLikelihoods = marginalized.sampleMatrix(s); + final int sampleReadCount = sampleLikelihoods.readCount(); + final int oldSampleReadCount = oldSmapleLikelihoods.readCount(); + Assert.assertEquals(sampleReadCount,(oldSampleReadCount + 1) / 2); + for (int r = 0; r < sampleReadCount; r++) { + double oldBestLk = Double.NEGATIVE_INFINITY; + for (final Allele oldAllele : oldAlleles) { + oldBestLk = Math.max(oldSmapleLikelihoods.get(original.alleleIndex(oldAllele),r << 1), oldBestLk); + } + Assert.assertEquals(sampleLikelihoods.get(a,r),oldBestLk); + } + } + } + } + + @Test(dataProvider = "marginalizationDataSets") + public void testMarginalization(final String[] samples, final Allele[] alleles, final Map> reads, final Map> newToOldAlleleMapping) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + fillWithRandomLikelihoods(samples, alleles, original); + final ReadLikelihoods marginalized = original.marginalize(newToOldAlleleMapping); + Assert.assertNotNull(marginalized); + Assert.assertEquals(newToOldAlleleMapping.size(),marginalized.alleleCount()); + for (int a = 0; a < marginalized.alleleCount(); a++) { + final List oldAlleles = newToOldAlleleMapping.get(marginalized.alleleAt(a)); + Assert.assertNotNull(oldAlleles); + for (int s = 0; s < samples.length; s++) { + final ReadLikelihoods.Matrix oldSmapleLikelihoods = original.sampleMatrix(s); + final ReadLikelihoods.Matrix sampleLikelihoods = marginalized.sampleMatrix(s); + final int sampleReadCount = sampleLikelihoods.readCount(); + final int oldSampleReadCount = oldSmapleLikelihoods.readCount(); + Assert.assertEquals(oldSampleReadCount,sampleReadCount); + for (int r = 0; r < sampleReadCount; r++) { + double oldBestLk = Double.NEGATIVE_INFINITY; + for (final Allele oldAllele : oldAlleles) { + oldBestLk = Math.max(oldSmapleLikelihoods.get(original.alleleIndex(oldAllele),r), oldBestLk); + } + Assert.assertEquals(sampleLikelihoods.get(a,r),oldBestLk); + } + } + } + } + + @Test(dataProvider = "dataSets") + public void testNormalizeBestToZero(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result= original.clone(); + result.normalizeLikelihoods(true, Double.NEGATIVE_INFINITY); + testAlleleQueries(alleles,result); + final int alleleCount = alleles.length; + final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][]; + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + for (int a = 0; a < alleleCount; a++) + newLikelihoods[s][a] = new double[sampleReadCount]; + for (int r = 0; r < sampleReadCount; r++) { + double bestLk = originalLikelihoods[s][0][r]; + for (int a = 1; a < alleleCount; a++) { + bestLk = Math.max(bestLk,originalLikelihoods[s][a][r]); + } + for (int a = 0; a < alleleCount; a++) { + newLikelihoods[s][a][r] = originalLikelihoods[s][a][r] - bestLk; + } + } + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + @Test(dataProvider = "dataSets") + public void testNormalizeCapWorstLK(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result= original.clone(); + result.normalizeLikelihoods(false, - 0.001); + testAlleleQueries(alleles,result); + final int alleleCount = alleles.length; + final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][]; + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + for (int a = 0; a < alleleCount; a++) + newLikelihoods[s][a] = new double[sampleReadCount]; + for (int r = 0; r < sampleReadCount; r++) { + double bestAltLk = Double.NEGATIVE_INFINITY; + for (int a = 0; a < alleleCount; a++) { + if (alleles[a].isReference()) + continue; + bestAltLk = Math.max(bestAltLk,originalLikelihoods[s][a][r]); + } + if (bestAltLk == Double.NEGATIVE_INFINITY) + for (int a = 0; a < alleleCount; a++) { + newLikelihoods[s][a][r] = originalLikelihoods[s][a][r]; + } + else + for (int a = 0; a < alleleCount; a++) { + newLikelihoods[s][a][r] = Math.max(originalLikelihoods[s][a][r],bestAltLk - 0.001); + } + } + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + @Test(dataProvider = "dataSets") + public void testNormalizeCapWorstLKAndBestToZero(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result= original.clone(); + result.normalizeLikelihoods(true, - 0.001); + testAlleleQueries(alleles,result); + final int alleleCount = alleles.length; + final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][]; + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + for (int a = 0; a < alleleCount; a++) + newLikelihoods[s][a] = new double[sampleReadCount]; + for (int r = 0; r < sampleReadCount; r++) { + double bestAltLk = Double.NEGATIVE_INFINITY; + double bestLk = Double.NEGATIVE_INFINITY; + for (int a = 0; a < alleleCount; a++) { + bestLk = Math.max(bestLk,originalLikelihoods[s][a][r]); + if (alleles[a].isReference()) + continue; + bestAltLk = Math.max(bestAltLk,originalLikelihoods[s][a][r]); + } + if (bestAltLk == Double.NEGATIVE_INFINITY) + for (int a = 0; a < alleleCount; a++) { + newLikelihoods[s][a][r] = originalLikelihoods[s][a][r] - bestLk; + } + else + for (int a = 0; a < alleleCount; a++) { + newLikelihoods[s][a][r] = Math.max(originalLikelihoods[s][a][r],bestAltLk - 0.001) - bestLk; + } + } + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + + @Test(dataProvider = "dataSets") + public void testAddMissingAlleles(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result = original.clone(); + + // If all the alleles pass are present in the read-likelihoods collection there is no change. + result.addMissingAlleles(result.alleles(),Double.NEGATIVE_INFINITY); + testLikelihoodMatrixQueries(samples,result,originalLikelihoods); + + // If the allele list passed is empty there is no effect. + result.addMissingAlleles(Collections.EMPTY_LIST,Double.NEGATIVE_INFINITY); + testLikelihoodMatrixQueries(samples,result,originalLikelihoods); + + final Allele newOne; + final Allele newTwo; + final Allele newThree; + + // We add a single missing. + result.addMissingAlleles(Arrays.asList(newOne = Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-12345.6); + Assert.assertEquals(result.alleleCount(), original.alleleCount() + 1); + + // We add too more amongst exisisting alleles: + result.addMissingAlleles(Arrays.asList(newTwo = Allele.create("ATATATTATATTAATATT".getBytes(), false),result.alleleAt(1), + result.alleleAt(0),newThree = Allele.create("TGTGTGTATTG".getBytes(),false),Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-6.54321); + + Assert.assertEquals(original.alleleCount()+3,result.alleleCount()); + + final List expectedAlleles = new ArrayList<>(original.alleles()); + expectedAlleles.add(newOne); expectedAlleles.add(newTwo); expectedAlleles.add(newThree); + + Assert.assertEquals(result.alleles(),expectedAlleles); + + final double[][][] newLikelihoods = new double[originalLikelihoods.length][][]; + for (int s = 0; s < samples.length; s++) { + newLikelihoods[s] = Arrays.copyOf(originalLikelihoods[s],originalLikelihoods[s].length + 3); + final int sampleReadCount = original.sampleReadCount(s); + final int originalAlleleCount = originalLikelihoods[s].length; + newLikelihoods[s][originalAlleleCount] = new double[sampleReadCount]; + Arrays.fill(newLikelihoods[s][originalAlleleCount],-12345.6); + newLikelihoods[s][originalAlleleCount+1] = new double[sampleReadCount]; + Arrays.fill(newLikelihoods[s][originalAlleleCount+1],-6.54321); + newLikelihoods[s][originalAlleleCount+2] = new double[sampleReadCount]; + Arrays.fill(newLikelihoods[s][originalAlleleCount+2],-6.54321); + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + + @Test(dataProvider = "dataSets") + public void testAddNonRefAllele(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result = original.clone(); + result.addNonReferenceAllele(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + Assert.assertEquals(result.alleleCount(),original.alleleCount() + 1); + Assert.assertEquals(result.alleleIndex(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE),result.alleleCount() - 1); + final double[][][] newLikelihoods = new double[originalLikelihoods.length][][]; + for (int s = 0; s < samples.length; s++) { + newLikelihoods[s] = Arrays.copyOf(originalLikelihoods[s],originalLikelihoods[s].length + 1); + final int sampleReadCount = original.sampleReadCount(s); + final int ordinaryAlleleCount = originalLikelihoods[s].length; + newLikelihoods[s][ordinaryAlleleCount] = new double[sampleReadCount]; + for (int r = 0; r < sampleReadCount; r++) { + double bestLk = newLikelihoods[s][0][r]; + double secondBestLk = Double.NEGATIVE_INFINITY; + for (int a = 1; a < ordinaryAlleleCount; a++) { + final double lk = originalLikelihoods[s][a][r]; + if (lk > bestLk) { + secondBestLk = bestLk; + bestLk = lk; + } else if (lk > secondBestLk) { + secondBestLk = lk; + } + } + final double expectedNonRefLk = Double.isInfinite(secondBestLk) ? bestLk : secondBestLk; + newLikelihoods[s][ordinaryAlleleCount][r] = expectedNonRefLk; + } + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + private void testLikelihoodMatrixQueries(String[] samples, ReadLikelihoods result, final double[][][] likelihoods) { + for (final String sample : samples) { + final int sampleIndex = result.sampleIndex(sample); + final int sampleReadCount = result.sampleReadCount(sampleIndex); + final int alleleCount = result.alleleCount(); + Assert.assertEquals(result.alleleCount(), alleleCount); + for (int a = 0; a < alleleCount; a++) { + Assert.assertEquals(result.sampleReadCount(sampleIndex),sampleReadCount); + for (int r = 0; r < sampleReadCount; r++) + Assert.assertEquals(result.sampleMatrix(sampleIndex).get(a,r), + likelihoods == null ? 0.0 : likelihoods[sampleIndex][a][r], EPSILON); + } + } + } + + private void testAlleleQueries(Allele[] alleles, ReadLikelihoods result) { + final Set alleleIndices = new HashSet<>(); + for (final Allele allele : alleles) { + final int alleleIndex = result.alleleIndex(allele); + Assert.assertTrue(alleleIndex >= 0); + Assert.assertFalse(alleleIndices.contains(alleleIndex)); + alleleIndices.add(alleleIndex); + Assert.assertSame(allele,alleles[alleleIndex]); + } + } + + private void testSampleQueries(String[] samples, Map> reads, ReadLikelihoods result) { + final Set sampleIds = new HashSet<>(samples.length); + for (final String sample : samples) { + final int sampleIndex = result.sampleIndex(sample); + Assert.assertTrue(sampleIndex >= 0); + Assert.assertFalse(sampleIds.contains(sampleIndex)); + sampleIds.add(sampleIndex); + + final List sampleReads = result.sampleReads(sampleIndex); + final Set sampleReadsSet = new HashSet<>(sampleReads); + final List expectedSampleReadArray = reads.get(sample); + final Set expectedSampleReadsSet = new HashSet<>(expectedSampleReadArray); + Assert.assertEquals(sampleReadsSet,expectedSampleReadsSet); + + final int sampleReadCount = sampleReads.size(); + for (int r = 0; r < sampleReadCount; r++) { + Assert.assertSame(sampleReads.get(r), expectedSampleReadArray.get(r)); + final int readIndex = result.readIndex(sampleIndex, sampleReads.get(r)); + Assert.assertEquals(readIndex,r); + } + } + } + + private String[][] SAMPLE_SETS = new String[][] { + {"A","B","C"}, + {"A"}, + {"C","A","D","E","Salsa","Gazpacho"}, + }; + + private Allele[][] ALLELE_SETS = new Allele[][] { + {Allele.create("A",true), Allele.create("T"), Allele.create("C")}, + {Allele.create("A",true)}, + {Allele.create("ATTTA"), Allele.create("A",true)}, + {Allele.create("A"), Allele.create("AT",true)}, + {Allele.create("A",false), Allele.create("AT",false)}, + }; + + @DataProvider(name="marginalizationDataSets") + public Object[][] marginalizationDataSets() { + try { + final Random rnd = Utils.getRandomGenerator(); + final Object[][] result = new Object[SAMPLE_SETS.length * ALLELE_SETS.length * ALLELE_SETS.length][]; + int nextIndex = 0; + for (int s = 0; s < SAMPLE_SETS.length; s++) { + for (int a = 0; a < ALLELE_SETS.length; a++) { + for (int b = 0; b < ALLELE_SETS.length; b++) { + if (ALLELE_SETS[b].length < ALLELE_SETS[a].length) + result[nextIndex++] = new Object[]{SAMPLE_SETS[s], ALLELE_SETS[a], + dataSetReads(SAMPLE_SETS[s], rnd), randomAlleleMap(ALLELE_SETS[a], ALLELE_SETS[b]) + }; + } + } + } + return Arrays.copyOf(result,nextIndex); + }catch (final Throwable e) { + throw new RuntimeException(e); + } + } + + private Map> randomAlleleMap(final Allele[] fromAlleles, final Allele[] toAlleles) { + final Map> result = new HashMap<>(toAlleles.length); + for (final Allele toAllele : toAlleles ) + result.put(toAllele,new ArrayList(fromAlleles.length)); + final ArrayList remaining = new ArrayList<>(Arrays.asList(fromAlleles)); + int nextToIndex = 0; + final Random rnd = Utils.getRandomGenerator(); + for (int i = 0; i < fromAlleles.length; i++) { + final int fromAlleleIndex = rnd.nextInt(remaining.size()); + result.get(toAlleles[nextToIndex]).add(remaining.remove(fromAlleleIndex)); + nextToIndex = (nextToIndex + 1) % toAlleles.length; + } + return result; + } + + + @DataProvider(name="dataSets") + public Object[][] dataSets() { + try { + final Random rnd = Utils.getRandomGenerator(); + final Object[][] result = new Object[SAMPLE_SETS.length * ALLELE_SETS.length][]; + int nextIndex = 0; + for (int s = 0; s < SAMPLE_SETS.length; s++) + for (int a = 0; a < ALLELE_SETS.length; a++) { + result[nextIndex++] = new Object[]{SAMPLE_SETS[s], ALLELE_SETS[a], + dataSetReads(SAMPLE_SETS[s], rnd) + }; + } + return result; + }catch (final Throwable e) { + throw new RuntimeException(e); + } + } + + private Map> dataSetReads(final String[] samples, + final Random rnd) { + final Map> result = new HashMap<>(samples.length); + for (final String sample : samples) { + final int readCount = rnd.nextInt(100); + final List reads = new ArrayList<>(readCount); + for (int r = 0; r < readCount; r++) { + final int alignmentStart = (r & 1) == 0 ? EVEN_READ_START : ODD_READ_START; + reads.add(ArtificialSAMUtils.createArtificialRead(SAM_HEADER, + "RRR" + sample + "00" + r, 0, alignmentStart ,"AAAAA".getBytes(), new byte[] {30,30,30,30,30}, "5M")); + } + result.put(sample,reads); + } + return result; + } + + @Test(dataProvider="readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference") + public void testInstantiationAndBasicQueries(final int[] readCounts, final int alleleCount, final boolean hasReference) { + final SampleList sampleList = sampleList(readCounts); + + final AlleleList alleleList = alleleList(alleleCount,hasReference); + final Map> sampleToReads = ReadLikelihoodsUnitTester.sampleToReads(sampleList, readCounts); + final ReadLikelihoods subject = new ReadLikelihoods<>(sampleList,alleleList,sampleToReads); + + AlleleListUnitTester.assertAlleleList(subject, AlleleListUtils.asList(alleleList)); + SampleListUnitTester.assertSampleList(subject,SampleListUtils.asList(sampleList)); + + if (hasReference) { + final int referenceIndex = AlleleListUtils.indexOfReference(alleleList); + Assert.assertTrue(referenceIndex >= 0); + Assert.assertEquals(AlleleListUtils.indexOfReference(alleleList),referenceIndex); + } else { + Assert.assertEquals(AlleleListUtils.indexOfReference(subject), -1); + } + + testLikelihoodMatrixQueries(alleleList, sampleList, sampleToReads, subject); + testAlleleQueries(alleleList, subject); + testSampleQueries(sampleList, sampleToReads, subject); + } + + @Test(dataProvider="readCountsAndAlleleCountDataSkippingNoLikelihoodsOrNoAlleleAndWithReference") + public void testLikelihoodWriting(final int[] readCounts, final int alleleCount, final boolean hasReference) { + final SampleList sampleList = sampleList(readCounts); + + final AlleleList alleleList = alleleList(alleleCount,hasReference); + final Map> sampleToReads = ReadLikelihoodsUnitTester.sampleToReads(sampleList,readCounts); + final ReadLikelihoods subject = new ReadLikelihoods<>(sampleList,alleleList,sampleToReads); + + final int sampleCount = readCounts.length; + int totalLikelihoodsSet = 0; + int expectedLikelihoodsSet = 0; + for (int s = 0; s < sampleCount; s++) { + expectedLikelihoodsSet += readCounts[s] * alleleCount; + final ReadLikelihoods.Matrix matrix = subject.sampleMatrix(s); + final int readCount = matrix.readCount(); + for (int a = 0; a < alleleCount; a++) + for (int r = 0; r < readCount; r++) { + final double likelihood = testLikelihood(s, a, r); + Assert.assertNotEquals(likelihood,0); //Paranoia + totalLikelihoodsSet++; + matrix.set(a,r,likelihood); + Assert.assertEquals(matrix.get(a, r),likelihood); + } + + } + Assert.assertEquals(totalLikelihoodsSet,expectedLikelihoodsSet); + } + + @Test(dependsOnMethods={"testLikelihoodWriting","testInstantiationAndBasicQueries"}, + dataProvider="readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference") + public void testMapConversion(final int[] readCounts, final int alleleCount, final boolean hasReference) { + final SampleList sampleList = sampleList(readCounts); + + final AlleleList alleleList = alleleList(alleleCount,hasReference); + final Map> sampleToReads = ReadLikelihoodsUnitTester.sampleToReads(sampleList,readCounts); + + final Set alleleWithLikelihoodsSet = new HashSet<>(); + final Set readsWithLikelihoodsSet = new HashSet<>(); + final Map map = new HashMap<>(sampleList.sampleCount()); + final int sampleCount = sampleList.sampleCount(); + for (int s = 0; s < sampleCount; s++) { + final String sample = sampleList.sampleAt(s); + final PerReadAlleleLikelihoodMap perSampleMap = new PerReadAlleleLikelihoodMap(); + final List reads = sampleToReads.get(sample); + for (int a = 0; a < alleleCount; a++) + for (int r = 0; r < reads.size(); r++) { + perSampleMap.add(reads.get(r), alleleList.alleleAt(a), testLikelihood(s, a, r)); + alleleWithLikelihoodsSet.add(alleleList.alleleAt(a)); + readsWithLikelihoodsSet.add(reads.get(r)); + } + map.put(sample,perSampleMap); + + } + + ReadLikelihoods subject = ReadLikelihoods.fromPerAlleleReadLikelihoodsMap(map); + + for (int s = 0; s < sampleCount; s++) { + final String sample = sampleList.sampleAt(s); + final int sIndex = subject.sampleIndex(sample); + Assert.assertTrue(sIndex >= 0); + Assert.assertTrue(sIndex < sampleCount); + final int sampleReadCount = sampleToReads.get(sample).size(); + final ReadLikelihoods.Matrix sampleLikelihoods = subject.sampleMatrix(sIndex); + for (int a = 0; a < alleleCount; a++) { + final Allele allele = alleleList.alleleAt(a); + final int aIndex = subject.alleleIndex(allele); + Assert.assertEquals(aIndex >= 0,alleleWithLikelihoodsSet.contains(allele)); + Assert.assertTrue(aIndex < alleleCount); + if (aIndex == -1) continue; + for (int r = 0; r < sampleReadCount; r++) { + final GATKSAMRecord read = sampleToReads.get(sample).get(r); + final int rIndex = subject.readIndex(sIndex,read); + final int rIndex2 = sampleLikelihoods.readIndex(read); + Assert.assertEquals(rIndex,rIndex2); + Assert.assertEquals(rIndex >= 0,readsWithLikelihoodsSet.contains(read)); + Assert.assertTrue(rIndex < sampleReadCount); + if (rIndex == -1) + continue; + final double likelihood = sampleLikelihoods.get(aIndex,rIndex); + Assert.assertEquals(likelihood,testLikelihood(s,a,r)); + } + } + } + } + + private double testLikelihood(final int sampleIndex, final int alleleIndex, final int readIndex) { + return - Math.abs(31 * (sampleIndex + 1) + 101 * alleleIndex + 1009 * readIndex); + } + + + private final Random rnd = Utils.getRandomGenerator(); + + private void testLikelihoodMatrixQueries(final AlleleList alleles, final SampleList samples, + final Map> sampleToReads, ReadLikelihoods result) { + for (final String sample : SampleListUtils.asList(samples)) { + final int sampleIndex = result.sampleIndex(sample); + final ReadLikelihoods.Matrix likelihoodMatrix = result.sampleMatrix(sampleIndex); + final int sampleReadCount = sampleToReads.get(sample).size(); + final List reads = sampleToReads.get(sample); + Assert.assertEquals(likelihoodMatrix.alleleCount(), alleles.alleleCount()); + Assert.assertEquals(likelihoodMatrix.readCount(), sampleReadCount); + for (int a = 0; a < likelihoodMatrix.alleleCount(); a++) { + Assert.assertEquals(likelihoodMatrix.alleleAt(a),alleles.alleleAt(a)); + for (int r = 0; r < sampleReadCount; r++) { + Assert.assertEquals(likelihoodMatrix.readAt(r),reads.get(r)); + Assert.assertEquals(likelihoodMatrix.get(a, r), 0.0); + } + } + } + } + + private void testAlleleQueries(final AlleleList alleles, ReadLikelihoods result) { + final Set alleleIndices = new HashSet<>(); + for (final Allele allele : AlleleListUtils.asList(alleles)) { + final int alleleIndex = result.alleleIndex(allele); + Assert.assertTrue(alleleIndex >= 0); + Assert.assertFalse(alleleIndices.contains(alleleIndex)); + alleleIndices.add(alleleIndex); + Assert.assertSame(allele,alleles.alleleAt(alleleIndex)); + } + } + + private void testSampleQueries(final SampleList samples, Map> reads, + final ReadLikelihoods result) { + final Set sampleIds = new HashSet<>(samples.sampleCount()); + for (final String sample : SampleListUtils.asList(samples)) { + final int sampleIndex = result.sampleIndex(sample); + Assert.assertTrue(sampleIndex >= 0); + Assert.assertFalse(sampleIds.contains(sampleIndex)); + sampleIds.add(sampleIndex); + + final List sampleReads = result.sampleReads(sampleIndex); + final Set sampleReadsSet = new HashSet<>(sampleReads); + final List expectedSampleReadArray = reads.get(sample); + final Set expectedSampleReadsSet = new HashSet<>(expectedSampleReadArray); + Assert.assertEquals(sampleReadsSet,expectedSampleReadsSet); + + final int sampleReadCount = sampleReads.size(); + for (int r = 0; r < sampleReadCount; r++) { + Assert.assertSame(sampleReads.get(r), expectedSampleReadArray.get(r)); + final int readIndex = result.readIndex(sampleIndex, sampleReads.get(r)); + Assert.assertEquals(readIndex,r); + } + } + } + + private AlleleList alleleList(final int alleleCount, final boolean hasReference) { + final Allele[] alleles = AlleleListUnitTester.generateRandomAlleles(alleleCount,100); + if (hasReference) { + final int referenceIndex = rnd.nextInt(alleleCount); + alleles[referenceIndex] = Allele.create(alleles[referenceIndex].getBases(),true); + } + final AlleleList alleleList = new IndexedAlleleList<>(alleles); + if (alleleList.alleleCount() != alleles.length) + throw new SkipException("repeated alleles, should be infrequent"); + return alleleList; + } + + private SAMFileHeader SAM_HEADER = ArtificialSAMUtils.createArtificialSamHeader(10, 0, 1000); + final GenomeLocParser locParser = new GenomeLocParser(SAM_HEADER.getSequenceDictionary()); + + + private int[][] READ_COUNTS = new int[][] { + {}, + { 100 }, + { 0 }, + { 0, 0, 0 }, + { 1, 0, 1 }, + { 100, 10 , 100}, + { 1000, 10, 100, 20, 23 } + }; + + private int[] ALLELE_COUNTS = new int[] { 0, 1, 2, 3, 10, 20 }; + + @DataProvider(name="readCountsAndAlleleCountData") + public Object[][] readCountsAndAlleleCountData() { + final Object[][] result = new Object[READ_COUNTS.length * ALLELE_COUNTS.length * 2][]; + int index = 0; + for (final int[] readCounts : READ_COUNTS) + for (final int alleleCount : ALLELE_COUNTS) { + result[index++] = new Object[]{ readCounts, alleleCount, false}; + result[index++] = new Object[]{ readCounts, alleleCount, true}; + } + return result; + } + + @DataProvider(name="readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference") + public Object[][] readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference() { + final Object[][] raw = readCountsAndAlleleCountData(); + final List result = new ArrayList<>(raw.length); + for (final Object[] paramSet : raw) + if (!paramSet[2].equals(true) || !paramSet[1].equals(0)) + result.add(paramSet); + return result.toArray(new Object[result.size()][]); + } + + @DataProvider(name="readCountsAndAlleleCountDataSkippingNoLikelihoodsOrNoAlleleAndWithReference") + public Object[][] readCountsAndAlleleCountDataSkippingNoLikelihoodsOrNoAlleleAndWithReference() { + final Object[][] raw = readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference(); + final List result = new ArrayList<>(raw.length); + for (final Object[] paramSet : raw) { + final int[] readCounts = (int[]) paramSet[0]; + final long totalReadCount = MathUtils.sum(readCounts); + if (totalReadCount > 0) + result.add(paramSet); + } + return result.toArray(new Object[result.size()][]); + } + + private SampleList sampleList(final int[] readCounts) { + final List samples = new ArrayList<>(readCounts.length); + for (int i = 0; i < readCounts.length; i++) + samples.add("SAMPLE_" + i); + return new IndexedSampleList(samples); + } + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTester.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTester.java index 44f8279e1..0aededd99 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTester.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTester.java @@ -54,7 +54,9 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.samtools.SAMFileHeader; import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.genotyper.SampleList; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUnitTester.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUnitTester.java index f66990d75..c79acccbc 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUnitTester.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUnitTester.java @@ -51,8 +51,8 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; import org.testng.Assert; import java.util.*; @@ -68,8 +68,8 @@ public class SampleListUnitTester { * Test that the contents of a sample-list are the ones expected. * *

    - * This method perform various consistency check involving all the {@link org.broadinstitute.gatk.tools.walkers.genotyper.SampleList} interface methods. - * Therefore calling this method is equivalent to a thorough check of the {@link org.broadinstitute.gatk.tools.walkers.genotyper.SampleList} aspect of + * This method perform various consistency check involving all the {@link org.broadinstitute.gatk.utils.genotyper.SampleList} interface methods. + * Therefore calling this method is equivalent to a thorough check of the {@link org.broadinstitute.gatk.utils.genotyper.SampleList} aspect of * the {@code actual} argument. *

    * diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUtilsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUtilsUnitTest.java index 565d0cc47..4575e62c1 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUtilsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUtilsUnitTest.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleListUtils; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -64,7 +64,7 @@ import java.util.Arrays; import java.util.List; /** - * Test {@link org.broadinstitute.gatk.tools.walkers.genotyper.AlleleListUtils}. + * Test {@link org.broadinstitute.gatk.utils.genotyper.AlleleListUtils}. * * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/StandardCallerArgumentCollectionUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/StandardCallerArgumentCollectionUnitTest.java index 780ab3e0d..4781488a9 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/StandardCallerArgumentCollectionUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/StandardCallerArgumentCollectionUnitTest.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection; +import org.broadinstitute.gatk.tools.walkers.genotyper.StandardCallerArgumentCollection; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCallerArgumentCollection; +import org.broadinstitute.gatk.utils.Utils; import org.testng.SkipException; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -151,7 +151,7 @@ public class StandardCallerArgumentCollectionUnitTest { public T randomArgumentCollection(final Class clazz) throws IllegalAccessException, InstantiationException { final T result = clazz.newInstance(); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); for (final Field field : clazz.getFields()) { final int fieldModifiers = field.getModifiers(); if (!Modifier.isPublic(fieldModifiers)) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java index 3f38d20e9..4d7b1568b 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java @@ -65,6 +65,8 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.FixedAFCalculatorP import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java index ff8143a84..a3458305b 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -74,7 +74,7 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe @Test(enabled = true) public void testBOTH_GGA_Pools() { - executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "71b196793025aa1f99cb8f6f9929d0bf"); + executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "972c8db4b1cc971bd714fd9c1a72b65a"); } @Test(enabled = true) @@ -84,6 +84,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "ea5b4124be3ab15a14b670506a98fd9b"); + executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "6007e0735aa5a680da92396345824077"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java index 005387bc9..a2f2262fa 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java @@ -63,7 +63,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","e0060cbb6d5e1af3b274a1e577ba47a9"); + executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","fdcdfbed14fb7d703cd991ee7d2821a6"); } @Test(enabled = true) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index a63af7f44..8475cbd18 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -140,7 +140,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("053913cb29fee481158e1f497a4fffdc")); + Arrays.asList("781d305993aedcc1a4c199a5c63ac54c")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index e975c11d0..de8ac514c 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -54,9 +54,9 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.samtools.util.BlockCompressedInputStream; import htsjdk.tribble.readers.AsciiLineReader; import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import org.testng.Assert; @@ -246,14 +246,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { Arrays.asList(md5)); executeTest("test parallelization (single thread)", spec1); - GenomeAnalysisEngine.resetRandomGenerator(); + Utils.resetRandomGenerator(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 2", 1, Arrays.asList(md5)); executeTest("test parallelization (2 threads)", spec2); - GenomeAnalysisEngine.resetRandomGenerator(); + Utils.resetRandomGenerator(); WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 4", 1, @@ -310,7 +310,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000 " + "-A SnpEff", 1, - UserException.class); + Arrays.asList("037ce3364668ee6527fba80c4f4bff95")); executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index b9a2f4fad..b97d108c7 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -102,7 +102,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("37594ce48695bf443c9251f70006f2f0")); + Arrays.asList("837847b512c8f60d7c572dd6a80239d8")); executeTest("test Multiple SNP alleles", spec); } @@ -118,7 +118,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("ad2be9f69ae8c6776b3bfba069735f50")); + Arrays.asList("0fc44ff26d3f913e7012b000a4de9682")); executeTest("test reverse trim", spec); } @@ -126,7 +126,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("1cc9c3e45e0296bb33042b409db18ca4")); + Arrays.asList("0dba3bc42c0eb43fea205d528739e9da")); executeTest("test mismatched PLs", spec); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProviderUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProviderUnitTest.java index 325fbf962..9fddd1722 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProviderUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProviderUnitTest.java @@ -54,7 +54,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection; -import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection; +import org.broadinstitute.gatk.tools.walkers.genotyper.StandardCallerArgumentCollection; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AFPriorProviderUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AFPriorProviderUnitTest.java index 629bcbbf9..366b90bb7 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AFPriorProviderUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AFPriorProviderUnitTest.java @@ -51,12 +51,12 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.tools.walkers.genotyper.AFPriorProvider; import org.broadinstitute.gatk.tools.walkers.genotyper.CustomAFPriorProvider; import org.broadinstitute.gatk.tools.walkers.genotyper.HeterozygosityAFPriorProvider; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.Utils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -77,7 +77,7 @@ public class AFPriorProviderUnitTest extends BaseTest { @Test(dataProvider="HeterozygosityProviderData") public void testHeterozygosityProvider(final double h, final int useCount, final int minPloidy, final int maxPloidy) { final double het = h / maxPloidy; - final Random rdn = GenomeAnalysisEngine.getRandomGenerator(); + final Random rdn = Utils.getRandomGenerator(); final int[] plodies = new int[useCount]; for (int i = 0; i < useCount; i++) plodies[i] = rdn.nextInt(maxPloidy - minPloidy + 1) + minPloidy; @@ -100,7 +100,7 @@ public class AFPriorProviderUnitTest extends BaseTest { @Test(dataProvider="CustomProviderData") public void testCustomProvider(final int ploidy) { final double[] priors = new double[ploidy]; - final Random rdn = GenomeAnalysisEngine.getRandomGenerator(); + final Random rdn = Utils.getRandomGenerator(); double remaining = 1; final List priorsList = new ArrayList(); for (int i = 0; i < priors.length; i++) { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTestDataSet.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTestDataSet.java new file mode 100644 index 000000000..3a12ee99c --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTestDataSet.java @@ -0,0 +1,587 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import htsjdk.samtools.GATKBin; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import org.apache.commons.math.distribution.ExponentialDistribution; +import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingGraph; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** +* Mock-up active region data used in testing. +* +* @author Valentin Ruano-Rubio <valentin@broadinstitute.org> +*/ +public class ActiveRegionTestDataSet { + + private final byte[] referenceBytes; + protected String reference; + protected String[] haplotypeCigars; + protected List haplotypeStrings; + protected String[] readCigars; + protected byte[] bq; + protected byte[] dq; + protected byte[] iq; + protected int kmerSize; + private List haplotypeList; + private List readList; + private AssemblyResultSet assemblyResultSet; + private Map readBySequence; + private String stringRepresentation; + private List> readEventOffsetList; + private GenomeLocParser genomeLocParser; + + /** Create a new active region data test set */ + public ActiveRegionTestDataSet(final int kmerSize, final String reference, final String[] haplotypes, + final String[] readCigars, final byte[] bq, final byte[] dq, final byte[] iq) { + this.reference = reference; + this.referenceBytes = reference.getBytes(); + this.haplotypeCigars = haplotypes; + this.readCigars = readCigars; + this.bq = bq; + this.dq = dq; + this.iq = iq; + this.kmerSize = kmerSize; + this.genomeLocParser = new GenomeLocParser(ArtificialSAMUtils.createArtificialSamHeader(1,1,reference.length()).getSequenceDictionary()); + } + + public String getReference() { + return reference; + } + + public String toString() { + if (stringRepresentation == null) + return super.toString(); + else return stringRepresentation; + } + + public AssemblyResultSet assemblyResultSet() { + if (assemblyResultSet == null) { + final ReadThreadingGraph rtg = new ReadThreadingGraph(kmerSize); + rtg.addSequence("anonymous", this.getReference().getBytes(), true); + for (final String haplotype : this.haplotypesStrings()) { + rtg.addSequence("anonymous", haplotype.getBytes(), false); + } + rtg.buildGraphIfNecessary(); + if (rtg.hasCycles()) + throw new RuntimeException("there is cycles in the reference with kmer size " + kmerSize + ". Don't use this size for the benchmark or change the reference"); + + List haplotypeList = this.haplotypeList(); + + assemblyResultSet = new AssemblyResultSet(); + final AssemblyResult ar = new AssemblyResult((haplotypeList.size() > 1 ? + AssemblyResult.Status.ASSEMBLED_SOME_VARIATION : AssemblyResult.Status.JUST_ASSEMBLED_REFERENCE),rtg.convertToSequenceGraph()); + ar.setThreadingGraph(rtg); + + for (final Haplotype h : haplotypeList) + assemblyResultSet.add(h, ar); + } + return assemblyResultSet; + } + + public List haplotypesStrings() { + if (haplotypeStrings != null) { + return haplotypeStrings; + } + final List result = new ArrayList<>(haplotypeCigars.length); + String reference = this.reference; + for (final String cigar : haplotypeCigars) { + if (cigar.matches("^Civar:.*$")) { + stringRepresentation = cigar.substring(6); + result.addAll(expandAllCombinations(cigar.substring(6),reference)); + } else if (cigar.matches("^.*\\d+.*$")) { + result.add(applyCigar(reference, cigar,0,true)); + } else { + result.add(cigar); + } + } + haplotypeStrings = result; + return result; + } + + private List expandAllCombinations(final String cigarString, final String reference) { + final Civar civar = Civar.fromCharSequence(cigarString); + final List unrolledCivars = civar.optionalizeAll().unroll(); + List result = new ArrayList<>(unrolledCivars.size()); + for (final Civar c : unrolledCivars) { + result.add(c.applyTo(reference)); + } + return result; + } + + private List expandAllHaplotypeCombinations(final String civarString, final String reference) { + final Civar civar = Civar.fromCharSequence(civarString); + final List unrolledCivars = civar.optionalizeAll().unroll(); + List result = new ArrayList<>(unrolledCivars.size()); + for (final Civar c : unrolledCivars) { + final String baseString = c.applyTo(reference); + final Haplotype haplotype = new Haplotype(baseString.getBytes(),baseString.equals(reference)); + haplotype.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); + try { + haplotype.setCigar(c.toCigar(reference.length())); + } catch (final RuntimeException ex) { + c.applyTo(reference); + c.toCigar(reference.length()); + throw new RuntimeException("" + c + " " + ex.getMessage(),ex); + } + result.add(haplotype); + } + return result; + } + + + public List haplotypeList() { + if (haplotypeList == null) { + + final List result = new ArrayList<>(haplotypeCigars.length); + final String reference = this.reference; + for (final String cigar : haplotypeCigars) { + if (cigar.matches("^Civar:.*$")) { + stringRepresentation = cigar.substring(6); + result.addAll(expandAllHaplotypeCombinations(cigar.substring(6), reference)); + } else if (cigar.matches("^.*\\d+.*$")) { + result.add(cigarToHaplotype(reference, cigar, 0, true)); + } else { + final Haplotype h = new Haplotype(cigar.getBytes()); + h.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); + result.add(h); + } + } + haplotypeList = result; + } + return haplotypeList; + } + + + protected SAMSequenceDictionary artificialSAMSequenceDictionary() { + return new SAMSequenceDictionary(Collections.singletonList(new SAMSequenceRecord("00",reference.length()))); + } + + protected SAMFileHeader artificialSAMFileHeader() { + return ArtificialSAMUtils.createArtificialSamHeader(artificialSAMSequenceDictionary()); + } + + public List readList() { + if (readList == null) { + final SAMFileHeader header = artificialSAMFileHeader(); + readList = new ArrayList<>(readCigars.length); + final List haplotypes = haplotypesStrings(); + int count = 0; + for (final String descr : readCigars) { + String sequence; + if (descr.matches("^\\d+:\\d+:.+$")) { + final String[] parts = descr.split(":"); + int allele = Integer.valueOf(parts[0]); + int offset = Integer.valueOf(parts[1]); + final String cigar = parts[2]; + final String base = allele == 0 ? reference : haplotypes.get(allele - 1); + sequence = applyCigar(base, cigar, offset, false); + final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header, "read_" + count, 0, 1, sequence.getBytes(), Arrays.copyOf(bq, sequence.length())); + readList.add(new MyGATKSAMRecord(samRecord)); + } else if (descr.matches("^\\*:\\d+:\\d+$")) { + int readCount = Integer.valueOf(descr.split(":")[1]); + int readLength = Integer.valueOf(descr.split(":")[2]); + readList.addAll(generateSamRecords(haplotypes, readCount, readLength, header, count)); + } else { + sequence = descr; + final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header, "read_" + count, 0, 1, sequence.getBytes(), Arrays.copyOf(bq, sequence.length())); + readList.add(new MyGATKSAMRecord(samRecord)); + } + count = readList.size(); + } + } + return readList; + } + + public List> readEventOffsetList() { + if (haplotypeCigars.length != 1 || !haplotypeCigars[0].startsWith("Civar:")) + throw new UnsupportedOperationException(); + if (readEventOffsetList == null) { + final Civar civar = Civar.fromCharSequence(haplotypeCigars[0].substring(6)); + final List unrolledCivars = civar.optionalizeAll().unroll(); + + readEventOffsetList = new ArrayList<>(readCigars.length); + int count = 0; + for (final String descr : readCigars) { + if (descr.matches("^\\d+:\\d+:.+$")) { + throw new UnsupportedOperationException(); + } else if (descr.matches("^\\*:\\d+:\\d+$")) { + int readCount = Integer.valueOf(descr.split(":")[1]); + int readLength = Integer.valueOf(descr.split(":")[2]); + readEventOffsetList.addAll(generateElementOffsetRecords(haplotypesStrings(), unrolledCivars, readCount, readLength, count)); + } else { + throw new UnsupportedOperationException(); + } + count = readEventOffsetList.size(); + } + readEventOffsetList = Collections.unmodifiableList(readEventOffsetList); + } + return readEventOffsetList; + } + + + + + @SuppressWarnings("unused") + public String cigarToSequence(final String cigar) { + String reference = this.reference; + return applyCigar(reference, cigar,0,true); + } + + @SuppressWarnings("unused") + public GATKSAMRecord readFromString(final String readSequence) { + if (readBySequence == null) { + final List readList = readList(); + readBySequence = new HashMap<>(readList.size()); + for (final GATKSAMRecord r : readList) + readBySequence.put(r.getReadString(),r); + } + return readBySequence.get(readSequence); + } + + public List unrolledCivars() { + if (haplotypeCigars.length != 1 || !haplotypeCigars[0].startsWith("Civar:")) + throw new UnsupportedOperationException(); + final Civar civar = Civar.fromCharSequence(haplotypeCigars[0].substring(6)); + return civar.optionalizeAll().unroll(); + } + + public void introduceErrors(final Random rnd) { + final List reads = readList(); + final ArrayList result = new ArrayList<>(reads.size()); + for (final GATKSAMRecord read : reads) { + result.add(new MyGATKSAMRecord(read,rnd)); + } + readList = result; + } + + private class MyGATKSAMRecord extends GATKSAMRecord { + protected MyGATKSAMRecord(final GATKSAMRecord r) { + super(r); + this.setMappingQuality(100); + GATKBin.setReadIndexingBin(this, -1); + } + + ExponentialDistribution indelLengthDist = MathUtils.exponentialDistribution(1.0 / 0.9); + + public MyGATKSAMRecord(final GATKSAMRecord r, final Random rnd) { + super(r); + this.setMappingQuality(100); + // setting read indexing bin last + + final byte[] bases = new byte[r.getReadBases().length]; + + final byte[] readBases = r.getReadBases(); + final byte[] bq = r.getBaseQualities(); + final byte[] iq = r.getBaseInsertionQualities(); + final byte[] dq = r.getBaseDeletionQualities(); + int refOffset = r.getAlignmentStart() - 1; + int readOffset = 0; + for (int i = 0; i < r.getReadBases().length;) { + double p = rnd.nextDouble(); + double iqp = QualityUtils.qualToErrorProb(iq[i]); + if (p < iqp) { // insertion + final int length = Math.min(generateIndelLength(rnd),r.getReadBases().length - i); + final int refStart = rnd.nextInt(reference.length() - length); + System.arraycopy(referenceBytes,refStart,bases,i,length); + i += length; + continue; + } + p -= iqp; + double dqp = QualityUtils.qualToErrorProb(dq[i]); + if (p < dqp) { + final int length = generateIndelLength(rnd); + refOffset += length; + refOffset = refOffset % referenceBytes.length; + readOffset += length; + continue; + } + p -= dqp; + double bqp = QualityUtils.qualToErrorProb(bq[i]); + byte b = readOffset < readBases.length ? readBases[readOffset] : referenceBytes[refOffset]; + byte nb; + if (p < bqp) { + switch (b) { + case 'A': nb = 'C'; break; + case 'T': nb = 'A'; break; + case 'C': nb = 'G'; break; + case 'G': nb = 'B'; break; + default: nb = 'A'; + } + } else + nb = b; + + bases[i++] = nb; + refOffset++; + refOffset = refOffset % referenceBytes.length; + readOffset++; + } + this.setReadBases(bases); + this.setBaseQualities(r.getBaseQualities()); + this.setReadName(r.getReadName()); + + GATKBin.setReadIndexingBin(this, -1); + } + + private int generateIndelLength(final Random rnd) { + final int length; + try { + length = (int) Math.round(indelLengthDist.inverseCumulativeProbability(rnd.nextDouble()) + 1); + } catch (Exception e) { + throw new RuntimeException(e); + } + return length; + } + + @Override + public byte[] getBaseDeletionQualities() { + return Arrays.copyOf(dq,getReadLength()); + } + + @Override + public byte[] getBaseInsertionQualities() { + return Arrays.copyOf(iq,getReadLength()); + } + + @Override + public int getMappingQuality() { + return 100; + } + + @Override + public int hashCode() { + return getReadName().hashCode(); + } + + @Override + public boolean equals(Object o) { + if (o instanceof GATKSAMRecord) { + return getReadName().equals(((GATKSAMRecord)o).getReadName()); + } else { + return false; + } + } + + public String toString() { + return super.toString() + " " + this.getReadString(); + } + } + + + public List readStrings() { + final List result = new ArrayList<>(readCigars.length); + final List haplotypes = haplotypesStrings(); + for (final String descr : readCigars) { + String sequence; + if (descr.matches("^\\d+:\\d+:.+$")) { + final String[] parts = descr.split(":"); + int allele = Integer.valueOf(parts[0]); + int offset = Integer.valueOf(parts[1]); + final String cigar = parts[2]; + final String base = allele == 0 ? reference : haplotypes.get(allele - 1); + sequence = applyCigar(base, cigar, offset, false); + result.add(sequence); + } else if (descr.matches("\\*:^\\d+:\\d+")) { + int readCount = Integer.valueOf(descr.split(":")[1]); + int readLength = Integer.valueOf(descr.split(":")[2]); + result.addAll(generateReads(haplotypes, readCount, readLength)); + } else { + sequence = descr; + result.add(sequence); + } + } + return result; + } + + private List generateReads(final List haplotypes, final int readCount, final int readLength) { + final List result = new ArrayList<>(readCount); + for (int i = 0; i < readCount; i++) { + int hi = i % haplotypes.size(); + final String h = haplotypes.get(hi); + int offset = i % h.length() - readLength; + result.add(h.substring(offset,offset + readLength)); + } + return result; + } + + private List generateSamRecords(final List haplotypes, final int readCount, final int readLength, final SAMFileHeader header, final int idStart) { + int id = idStart; + final List result = new ArrayList<>(readCount); + for (int i = 0; i < readCount; i++) { + int hi = i % haplotypes.size(); + final String h = haplotypes.get(hi); + int offset = h.length() <= readLength ? 0 : i % (h.length() - readLength); + int to = Math.min(h.length(),offset + readLength); + byte[] bases = h.substring(offset,to).getBytes(); + byte[] quals = Arrays.copyOf(bq,to - offset); + final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header,"read_" + id++,0,offset + 1,bases, quals); + result.add(new MyGATKSAMRecord(samRecord)); + } + return result; + } + + + private List> generateElementOffsetRecords(final List haplotypes, final List unrolledCivars, final int readCount, final int readLength, final int count) { + + final List> result = new ArrayList<>(readCount); + for (int i = 0; i < readCount; i++) { + int hi = i % unrolledCivars.size(); + final Civar c = unrolledCivars.get(hi); + final String h = haplotypes.get(hi); + int offset = h.length() <= readLength ? 0 : i % (h.length() - readLength); + int to = Math.min(h.length(),offset + readLength); + result.add(c.eventOffsets(reference,offset,to)); + } + return result; + } + + private static final Pattern cigarPattern = Pattern.compile("(\\d+)([=A-Z])"); + + + private Haplotype cigarToHaplotype(final String reference, final String cigar, final int offset, final boolean global) { + final String sequence = applyCigar(reference,cigar,offset,global); + final Haplotype haplotype = new Haplotype(sequence.getBytes(),reference.equals(sequence)); + haplotype.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); + haplotype.setCigar(Civar.fromCharSequence(cigar).toCigar(reference.length())); + return haplotype; + } + + private String applyCigar(final String reference, final String cigar, final int offset, final boolean global) { + final Matcher pm = cigarPattern.matcher(cigar); + StringBuffer sb = new StringBuffer(); + int index = offset; + while (pm.find()) { + int length = Integer.valueOf(pm.group(1)); + char operator = pm.group(2).charAt(0); + switch (operator) { + case '=' : + try { + sb.append(reference.substring(index, index + length)); + } catch (Exception e) { + throw new RuntimeException(" " + index + " " + (index + length) + " " + reference.length() + " " + cigar,e); + } + index += length; break; + case 'D' : + index += length; break; + case 'I' : + String insert = cigar.substring(pm.end(),pm.end() + length).toUpperCase(); + sb.append(insert); break; + case 'V' : + sb.append(transversionV(reference.charAt(index))); index++; break; + case 'W' : + sb.append(transversionW(reference.charAt(index))); index++; break; + case 'T' : + sb.append(transition(reference.charAt(index))); index++; break; + default: + throw new UnsupportedOperationException("cigar operator " + operator + " not supported."); + } + } + if (global && index != reference.length()) { + throw new RuntimeException(" haplotype cigar does not explain reference length (" + index + " != " + reference.length() + ") on cigar " + cigar); + } else if (index > reference.length()) { + throw new RuntimeException(" index beyond end "); + } + return sb.toString(); + } + + protected int kmerSize() { + return kmerSize; + } + + private char transversionV(final char c) { + switch (Character.toUpperCase(c)) { + case 'A': return 'C'; + case 'G': return 'T'; + case 'C': return 'A'; + case 'T': return 'G'; + default: + return c; + } + + } + + private char transversionW(final char c) { + switch (Character.toUpperCase(c)) { + case 'A': return 'T'; + case 'G': return 'C'; + case 'T': return 'A'; + case 'C': return 'G'; + default: + return c; + } + + } + + private char transition(final char c) { + switch (Character.toUpperCase(c)) { + case 'A': return 'G'; + case 'G': return 'A'; + case 'T': return 'C'; + case 'C': return 'T'; + default: + return c; + } + + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java index 25aedb149..3f0cb94f3 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java @@ -55,7 +55,6 @@ import com.google.caliper.Param; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.pairhmm.ActiveRegionTestDataSet; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.DataProvider; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AssemblyResultSetUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AssemblyResultSetUnitTest.java index a8a90f37a..b1174e22a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AssemblyResultSetUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AssemblyResultSetUnitTest.java @@ -60,7 +60,6 @@ import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.RandomDNA; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.pairhmm.ActiveRegionTestDataSet; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/FastLoglessPairHMMUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/FastLoglessPairHMMUnitTest.java new file mode 100644 index 000000000..8773bbc63 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/FastLoglessPairHMMUnitTest.java @@ -0,0 +1,183 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import org.broadinstitute.gatk.utils.haplotype.Haplotype; +import org.broadinstitute.gatk.utils.pairhmm.FastLoglessPairHMM; +import org.broadinstitute.gatk.utils.pairhmm.PairHMMReadyHaplotypes; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.*; + + +/** + * Created with IntelliJ IDEA. + * User: valentin + * Date: 10/13/13 + * Time: 12:55 PM + * To change this template use File | Settings | File Templates. + */ +public class FastLoglessPairHMMUnitTest extends ActiveRegionTestDataSetUnitTest { + + private FastLoglessPairHMM unsorted = new FastLoglessPairHMM((byte)10); + private FastLoglessPairHMM sorted = new FastLoglessPairHMM((byte)10); + + @Test(enabled=false,dataProvider="activeRegionTestDataSets") + public void testActiveRegionsDataSet(final ActiveRegionTestDataSet as, final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) { + + } + + @Test(enabled=true,dataProvider="activeRegionTestDataSets") + public void testHaplotypeGrouped(final ActiveRegionTestDataSet as, final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) { + final List reads = as.readList(); + final List haplotypes = as.haplotypeList(); + PairHMMReadyHaplotypes haplotypeCollection = new PairHMMReadyHaplotypes(haplotypes.size()); + final List sortedHaplotypes = new ArrayList<>(haplotypes); + Collections.sort(sortedHaplotypes, HAPLOTYPE_COMPARATOR); + Map basesToPos = new HashMap<>(sortedHaplotypes.size()); + int nextIdx = 0; + + for (final Haplotype h : sortedHaplotypes) { + final byte[] bases = h.getBases(); + haplotypeCollection.add(bases); + basesToPos.put(bases,nextIdx++); + } + for (GATKSAMRecord read : reads) { + final double[] unsortedLikelihoods = new double[sortedHaplotypes.size()]; + final double[] sortedLikelihoods = new double[sortedHaplotypes.size()]; + unsorted.loadRead(read); + sorted.loadRead(read); + final Map unsortedResults = new HashMap<>(haplotypes.size()); + for (int i = 0; i < sortedHaplotypes.size(); i++) { + final Haplotype h = sortedHaplotypes.get(i); + final byte[] haplotypeBases = h.getBases().clone(); + unsorted.loadHaplotypeBases(haplotypeBases); + double lk = unsorted.calculateLocalLikelihood(0, read.getReadLength(), 0, haplotypeBases.length, false); + unsortedLikelihoods[i] = lk; + } + sorted.calculateLocalLikelihoods(0, read.getReadLength(), haplotypeCollection); + for (final PairHMMReadyHaplotypes.Entry entry : haplotypeCollection) { + final byte[] bases = entry.getBases(); + final double lk = entry.getLikelihood(); + final int haplotypePos = basesToPos.get(bases); + sortedLikelihoods[haplotypePos] = lk; + } + for (int i = 0; i < unsortedLikelihoods.length; i++) + Assert.assertEquals(unsortedLikelihoods[i],sortedLikelihoods[i],0.00000001,Arrays.toString(unsortedLikelihoods) + Arrays.toString(sortedLikelihoods)); + } + } + + @Test(enabled=true,dataProvider="activeRegionTestDataSets") + public void testSortedVsUnsorted(final ActiveRegionTestDataSet as, final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) { + final List reads = as.readList(); + final List haplotypes = as.haplotypeList(); + final List sortedHaplotypes = new ArrayList<>(haplotypes); + Collections.sort(sortedHaplotypes, HAPLOTYPE_COMPARATOR); + + byte[] lastHaplotypeBases = null; + for (GATKSAMRecord read : reads) { + final double[] unsortedLikelihoods = new double[sortedHaplotypes.size()]; + final double[] sortedLikelihoods = new double[sortedHaplotypes.size()]; + unsorted.loadRead(read); + sorted.loadRead(read); + for (int i = 0; i < sortedHaplotypes.size(); i++) { + final Haplotype h = sortedHaplotypes.get(i); + final byte[] haplotypeBases = h.getBases().clone(); + final byte[] haplotypeBases2 = haplotypeBases.clone(); + int commonPrefixEnd = 0; + + + if (lastHaplotypeBases != null) { + final int prefixEndLimit = Math.min(lastHaplotypeBases.length,haplotypeBases.length); + for (commonPrefixEnd = 0; commonPrefixEnd < prefixEndLimit; commonPrefixEnd++) + if (lastHaplotypeBases[commonPrefixEnd] != haplotypeBases[commonPrefixEnd]) + break; + } + + unsorted.loadHaplotypeBases(haplotypeBases); + sorted.changeHaplotypeSuffix(commonPrefixEnd, haplotypeBases, commonPrefixEnd, haplotypeBases.length); + Assert.assertTrue(Arrays.equals(haplotypeBases2, unsorted.getHaplotypeBases())); + Assert.assertTrue(Arrays.equals(haplotypeBases2, sorted.getHaplotypeBases())); + unsortedLikelihoods[i] = unsorted.calculateLocalLikelihood(0, read.getReadLength(), 0, haplotypeBases.length, false); + sortedLikelihoods[i] = sorted.calculateLocalLikelihood(0, read.getReadLength(), 0, haplotypeBases.length, false); + Assert.assertTrue(Arrays.equals(haplotypeBases2,unsorted.getHaplotypeBases())); + Assert.assertTrue(Arrays.equals(haplotypeBases2,sorted.getHaplotypeBases())); + Assert.assertEquals((double)unsortedLikelihoods[i], (double) sortedLikelihoods[i],0.00000001); + lastHaplotypeBases = haplotypeBases; + } + } + } + + public static final Comparator HAPLOTYPE_COMPARATOR = new Comparator() { + + @Override + public int compare(final Haplotype o1, final Haplotype o2) { + if (o1 == o2) + return 0; + final byte[] bases1 = o1.getBases(); + final byte[] bases2 = o2.getBases(); + final int ilimit = Math.min(bases1.length,bases2.length); + for (int i = 0; i < ilimit; i++) { + final int cmp = Byte.compare(bases1[i],bases2[i]); + if (cmp != 0) return cmp; + } + if (bases1.length == bases2.length) return 0; + return (bases1.length > bases2.length) ? -1 : 1; // is a bit better to get the longest haplotypes first. + } + }; + + + + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java index e898829ad..c46a2a2de 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java @@ -53,8 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import com.google.caliper.Param; import com.google.caliper.SimpleBenchmark; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleListUtils; -import org.broadinstitute.gatk.utils.pairhmm.ActiveRegionTestDataSet; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; import org.broadinstitute.gatk.utils.pairhmm.FastLoglessPairHMM; import org.broadinstitute.gatk.utils.pairhmm.PairHMM; @@ -127,7 +126,7 @@ public class HCLikelihoodCalculationEnginesBenchmark extends SimpleBenchmark { public void timeLoglessPairHMM(final int reps) { for (int i = 0; i < reps; i++) { final PairHMMLikelihoodCalculationEngine engine = new PairHMMLikelihoodCalculationEngine((byte) 10, - PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3, true, PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.NONE); + PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, PairHMM.HMM_SUB_IMPLEMENTATION.UNVECTORIZED, true, -3, true, PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.NONE); engine.computeReadLikelihoods(dataSet.assemblyResultSet(), SampleListUtils.singletonList("anonymous"), Collections.singletonMap("anonymous", dataSet.readList())); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeBaseComparatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeBaseComparatorUnitTest.java new file mode 100644 index 000000000..dfd91ab24 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeBaseComparatorUnitTest.java @@ -0,0 +1,84 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; +import org.broadinstitute.gatk.utils.haplotype.HaplotypeBaseComparator; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class HaplotypeBaseComparatorUnitTest extends BaseTest { + @Test + public void testComparison() { + final List rawStrings = Arrays.asList("A", "C", "AC", "CT", "GTC", "ACGT"); + final List lexStrings = new ArrayList(rawStrings); + Collections.sort(lexStrings); + + for ( final List seqs : Utils.makePermutations(lexStrings, lexStrings.size(), false) ) { + final List haps = new ArrayList(seqs.size()); + for ( final String seq : seqs ) { + haps.add(new Haplotype(seq.getBytes(), false)); + } + + Collections.sort(haps, new HaplotypeBaseComparator()); + for ( int i = 0; i < lexStrings.size(); i++ ) + Assert.assertEquals(haps.get(i).getBaseString(), lexStrings.get(i), "Failed sort " + haps + " expected " + lexStrings); + } + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 21d1b53fb..136b656f2 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -61,19 +61,22 @@ import static org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCal public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest { + final static String HMM_SUB_IMPLEMENTATION = "UNVECTORIZED"; + final static String ALWAYS_LOAD_VECTOR_HMM = "-alwaysloadVectorHMM"; + private void HCTestComplexVariants(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4"; + final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4"; final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec); } @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "f4018f734d64f1f88b3ac4b712311567"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "070729585401dda47838911928ffbd2f"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1"; final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec); } @@ -81,11 +84,11 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa // TODO -- need a better symbolic allele test @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "845e795a345400a37c0dafdd5ce2f9ac"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "2bddd2bf5427142bf2235daa8589efee"); } private void HCTestComplexGGA(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec); } @@ -93,17 +96,17 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "9de64c4405e0dab99c70c2fae54d4841"); + "64421f715e0258defc9efcfef56bdaab"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "e208f8ab7c90559757fdabc4fe6710f7"); + "d2306f6ecfcee9340423ba251e0736a3"); } private void HCTestComplexConsensusMode(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -consensus -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf -alleles " + validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, bam) + " --no_cmdline_in_header -o %s -consensus -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf -alleles " + validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerComplexConsensusMode: args=" + args, spec); } @@ -111,7 +114,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleConsensusModeComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337", - "272e096b7dc2839d11343f35e5d5442d"); + "22c4135a87be18940ff622ea7ff9cabc"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index d00444202..42c8c6285 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -51,6 +51,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.engine.walkers.WalkerTest; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType; @@ -63,6 +64,9 @@ import java.util.List; public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { + final static String HMM_SUB_IMPLEMENTATION = "UNVECTORIZED"; + final static String ALWAYS_LOAD_VECTOR_HMM = "-alwaysloadVectorHMM"; + @DataProvider(name = "MyDataProviderHaploid") public Object[][] makeMyDataProviderHaploid() { List tests = new ArrayList<>(); @@ -71,12 +75,12 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "646ec07bd026da1e72b5e789f5aa3a3d"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "9acbee336e91cbfc1abeebd41bbcc9dd"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "e696ffc927af7f7a36dc7d49dad2c4f8"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "c223d6fe112d2bb698811600c3b7f6af"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "dacb94af2632e4dc4a1948306dd1661c"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "ef9093880efedac09b78c8fb26420e84"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3e440b1b755a21d7bd3ecb093af8f43e"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "ebe078a1e209a5b231aeeba6deebcb8a"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "2ad9b5d87416c466292c2b97480e1f5c"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "6ca1e6cb78157273a4a96ba00e6d4713"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "8645c191ca5dbbae8dcb1389717f985a"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "9606db453f9e8beae27669afcea288a1"}); return tests.toArray(new Object[][]{}); } @@ -90,13 +94,13 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "0a982ba98be666d56452791df32109d7"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "5edcfa5ab96bc327783484c2bbe1c06f"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "b60c70fac56f129af78eaff9ad769557"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "1c3570461e96ad6d66c6abb0fd6ee865"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "66019a0914f905522da6bd3b557a57d1"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "92c7415dd1a5793161032d839b88fc28"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "b90d7c900ff4a8b5e58d6bd4ad64d750"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "8a1dcc091cb28e1fbbc86a1de85dfd4c"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "7e84e4562d8df6e593e58f017f697355"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "307ce5ada7c68e2f08664937bafa6281"}); - final String NA12878bandedResolutionMD5 = "de763f6c9a5aec4586a2671941e4c96d"; + final String NA12878bandedResolutionMD5 = "d51df38ad52cf2b0ecbce362e60fb24e"; tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, NA12878bandedResolutionMD5}); tests.add(new Object[]{NA12878_WEx + " -I " + privateTestDir + "NA20313.highCoverageRegion.bam -sn NA12878", ReferenceConfidenceMode.GVCF, WExIntervals, NA12878bandedResolutionMD5}); @@ -113,56 +117,87 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "7db552463cf779644335bfa09fcddf82"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "90600d209cf778fdfca6a844b8ee4acb"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "db3c99786c5726b20dbfe47e31e50d60"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "eac793500fbc7de46259000dbbcdd27d"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "4a431e0e387de3f791318f67d8855b0b"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "1058d3fe6553e07f002f994759c9647d"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "f441aab92b07591281fa44748b7bd71e"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "d54d7988552a13de90977ba06b094b74"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "c3c12414059390f3d6e3e533502c1869"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "91164cf1247f5b187ad133b280aa1fd2"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "b5c1b79550a8d8bb479895e2be38d945"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "79e913ab2ddf19b3cae75f2da9394239"}); return tests.toArray(new Object[][]{}); } + @DataProvider(name = "MyDataProviderManyploid") + public Object[][] makeMyDataProviderManyploid() { + List tests = new ArrayList<>(); + + final String PCRFreeIntervals = "-L 20:10,000,000-10,010,000"; + final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "081d943a092b7ad71d1f1fa6ec191ace"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "93f95c7a51741f8e527f1308ffd91052"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "1f2e5b5c06cb6d1196315c4308421f6d"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "3d8f4f849df0b9cddb1ec61279e91a83"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "1fbe1435b860400ecae3115141453c9a"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "fe9e1992cc28b798dd3ee66aaba726aa"}); + + return tests.toArray(new Object[][]{}); + } + + /** - * Example testng test using MyDataProvider + * Test HaplotypeCaller, using MyDataProvider */ @Test(dataProvider = "MyDataProvider") public void testHCWithGVCF(String bam, ReferenceConfidenceMode mode, String intervals, String md5) { - final String commandLine = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", - b37KGReference, bam, intervals, mode, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final String commandLine = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, bam, intervals, mode, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); final String name = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode; final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5)); executeTest(name, spec); } /** - * Example testng test using MyDataProvider + * Test HaplotypeCaller with haploid samples, using MyDataProviderHaploid */ @Test(dataProvider = "MyDataProviderHaploid", enabled=true) public void testHCWithGVCFHaploid(final String bam, final ReferenceConfidenceMode mode, final String intervals, final String md5) { - final String commandLine = String.format("-T HaplotypeCaller -ploidy 1 --disableDithering --pcr_indel_model NONE -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", - b37KGReference, bam, intervals, mode, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final String commandLine = String.format("-T HaplotypeCaller -ploidy 1 --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, bam, intervals, mode, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); final String name = "testHCWithGVCFHaploid bam=" + bam + " intervals= " + intervals + " gvcf= " + mode; final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5)); executeTest(name, spec); } /** - * Example testng test using MyDataProvider + * Test HaplotypeCaller with tetraploid samples, using MyDataProviderTetraploid */ @Test(dataProvider = "MyDataProviderTetraploid", enabled=true) public void testHCWithGVCFTetraploid(final String bam, final ReferenceConfidenceMode mode, final String intervals, final String md5) { - final String commandLine = String.format("-T HaplotypeCaller -ploidy 4 --disableDithering --pcr_indel_model NONE -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", - b37KGReference, bam, intervals, mode, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final String commandLine = String.format("-T HaplotypeCaller -ploidy 4 --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, bam, intervals, mode, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); final String name = "testHCWithGVCFTetraploid bam=" + bam + " intervals= " + intervals + " gvcf= " + mode; final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5)); executeTest(name, spec); } + /** + * Test HaplotypeCaller with manyploid samples, using MyDataProviderManyploid + */ + @Test(dataProvider = "MyDataProviderManyploid", enabled=true) + public void testHCWithGVCFManyploid(final String bam, final ReferenceConfidenceMode mode, final String intervals, final String md5) { + final String commandLine = String.format("-T HaplotypeCaller -ploidy 33 --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, bam, intervals, mode, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); + final String name = "testHCWithGVCFManyploid bam=" + bam + " intervals= " + intervals + " gvcf= " + mode; + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5)); + executeTest(name, spec); + } + @Test public void testERCRegionWithNoCalledHaplotypes() { - final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", - b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); spec.disableShadowBCF(); executeTest("testERCRegionWithNoCalledHaplotypes", spec); @@ -170,17 +205,29 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { @Test() public void testMissingGVCFIndexException() { - final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF", - b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001"); + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001"); final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class); spec.disableShadowBCF(); executeTest("testMissingGVCFIndexingStrategyException", spec); } + /** + * Test HaplotypeCaller to ensure it does not throw an exception when a .g.vcf output file is specified and the indexing arguments are omitted + */ + @Test() + public void testGVCFIndexNoThrow() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000000-17000100"); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(GATKVCFUtils.GVCF_EXT), Arrays.asList("")); + spec.disableShadowBCF(); + executeTest("testGVCFIndexNoThrow", spec); + } + @Test() public void testWrongParameterGVCFIndexException() { - final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", - b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER + 1); + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER + 1); final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class); spec.disableShadowBCF(); executeTest("testMissingGVCFIndexingStrategyException", spec); @@ -190,11 +237,11 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testWrongTypeGVCFIndexException() { // ensure non-optimal, if optimal changes GATKVCFIndexType type = GATKVCFIndexType.DYNAMIC_SEEK; - if (HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE == GATKVCFIndexType.DYNAMIC_SEEK) + if (GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE == GATKVCFIndexType.DYNAMIC_SEEK) type = GATKVCFIndexType.DYNAMIC_SIZE; - final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", - b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", type, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", type, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class); spec.disableShadowBCF(); executeTest("testMissingGVCFIndexingStrategyException", spec); @@ -205,9 +252,9 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { @Test() public void testWrongGVCFNonVariantRecordOrderBugFix() { - final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", - b37KGReference, WRONG_GVCF_RECORD_ORDER_BUGFIX_BAM, WRONG_GVCF_RECORD_ORDER_BUGFIX_INTERVALS, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("e6a4e571abb59b925d59a38d244f0abe")); + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, WRONG_GVCF_RECORD_ORDER_BUGFIX_BAM, WRONG_GVCF_RECORD_ORDER_BUGFIX_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("7fa0578150ea8ef333cb141f78cf4a5a")); spec.disableShadowBCF(); executeTest("testMissingGVCFIndexingStrategyException", spec); } @@ -222,9 +269,9 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { @Test public void testNoCallGVCFMissingPLsBugFix() { - final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", - b37KGReference, NOCALL_GVCF_BUGFIX_BAM, NOCALL_GVCF_BUGFIX_INTERVALS, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("7ef1f30d92178f75e5220b16508b47cd")); + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, NOCALL_GVCF_BUGFIX_BAM, NOCALL_GVCF_BUGFIX_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("befa4bf150099b3faf44130a6c9cdbb9")); spec.disableShadowBCF(); executeTest("testNoCallGVCFMissingPLsBugFix", spec); } @@ -234,8 +281,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { */ @Test(enabled=true) public void testGeneralPloidyArrayIndexBug1Fix() { - final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 1 -maxAltAlleles 2 -isr INTERSECTION -L 1:23696115-23696189", - b37KGReference, GENERAL_PLOIDY_BUGFIX1_BAM, GENERAL_PLOIDY_BUGFIX1_INTERVALS, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 1 -maxAltAlleles 2 -isr INTERSECTION -L 1:23696115-23696189", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, GENERAL_PLOIDY_BUGFIX1_BAM, GENERAL_PLOIDY_BUGFIX1_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); spec.disableShadowBCF(); executeTest(" testGeneralPloidyArrayIndexBug1Fix", spec); @@ -246,8 +293,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { */ @Test(enabled=true) public void testGeneralPloidyArrayIndexBug2Fix() { - final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 2 -maxAltAlleles 2 -A DepthPerSampleHC -A StrandBiasBySample -L 1:38052860-38052937", - b37KGReference, GENERAL_PLOIDY_BUGFIX2_BAM, GENERAL_PLOIDY_BUGFIX2_INTERVALS, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 2 -maxAltAlleles 2 -A DepthPerSampleHC -A StrandBiasBySample -L 1:38052860-38052937", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, GENERAL_PLOIDY_BUGFIX2_BAM, GENERAL_PLOIDY_BUGFIX2_INTERVALS, GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); spec.disableShadowBCF(); executeTest(" testGeneralPloidyArrayIndexBug2Fix", spec); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java index 352ef867a..b9c005554 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java @@ -71,7 +71,7 @@ import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.smithwaterman.Parameters; import org.broadinstitute.gatk.utils.smithwaterman.SWPairwiseAlignment; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -230,7 +230,7 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest { } Allele altAllele = null; for (final Allele allele : updatedVc.getAlleles()) - if (allele.isSymbolic() && allele.getBaseString().equals(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME)) + if (allele.isSymbolic() && allele.getBaseString().equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE_NAME)) altAllele = allele; Assert.assertNotNull(altAllele); } @@ -521,8 +521,8 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest { int counter = 0; for ( final VariantContext call : actualPhasedCalls ) { for ( final Genotype g : call.getGenotypes() ) { - if ( g.hasExtendedAttribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_ID_KEY) ) { - uniqueGroups.add(g.getExtendedAttribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_ID_KEY).toString()); + if ( g.hasExtendedAttribute(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_ID_KEY) ) { + uniqueGroups.add(g.getExtendedAttribute(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_ID_KEY).toString()); counter++; } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 14dd1cf76..0b6bfd201 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -56,11 +56,14 @@ import htsjdk.tribble.readers.LineIterator; import htsjdk.tribble.readers.PositionalBufferedStream; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFCodec; +import htsjdk.variant.vcf.VCFConstants; +import org.apache.commons.io.FileUtils; import org.broadinstitute.gatk.engine.walkers.WalkerTest; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; +import org.broadinstitute.gatk.utils.exceptions.UserException; import org.testng.Assert; import org.testng.annotations.Test; @@ -81,107 +84,110 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final static String CEUTRIO_MT_TEST_BAM = privateTestDir + "CEUTrio.HiSeq.b37.MT.1_50.bam"; final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; final static String GGA_INTERVALS_FILE = privateTestDir + "haplotype-caller-reduced-test-interval.list"; + final static String HMM_SUB_IMPLEMENTATION = "UNVECTORIZED"; + final static String ALWAYS_LOAD_VECTOR_HMM = "-alwaysloadVectorHMM"; - private void HCTest(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE --maxReadsInRegionPerSample 1000 --minReadsPerAlignmentStart 5 --maxProbPropagationDistance 50 --activeProbabilityThreshold 0.002 -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; + private void HCTest(String bam, String args, String md5) throws IOException { + final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE --maxReadsInRegionPerSample 1000 --minReadsPerAlignmentStart 5 --maxProbPropagationDistance 50 --activeProbabilityThreshold 0.002 -pairHMMSub %s %s -R %s -I %s -L %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); - executeTest("testHaplotypeCaller: args=" + args, spec); + final File outputVCF = executeTest("testHaplotypeCaller: args=" + args, spec).getFirst().get(0); + Assert.assertFalse(FileUtils.readFileToString(outputVCF).contains(VCFConstants.MAPPING_QUALITY_ZERO_KEY)); } @Test - public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "5468f50b4ed198e6e9b05a67c3103f72"); + public void testHaplotypeCallerMultiSample() throws IOException { + HCTest(CEUTRIO_BAM, "", "e8a73b950d027239b780757d898c7334"); } @Test - public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "e85ada8486a4ed7231918187a100e1c3"); + public void testHaplotypeCallerSingleSample() throws IOException { + + HCTest(NA12878_BAM, "", "c741efeb6f3e412c4e707da3cabee621"); } @Test - public void testHaplotypeCallerMultiSampleHaploid() { - HCTest(CEUTRIO_BAM, - "-ploidy 1", "5046d3f77a56fcc4ccc8a216670effac"); + public void testHaplotypeCallerMultiSampleHaploid() throws IOException { + HCTest(CEUTRIO_BAM, "-ploidy 1", "5bfcfdea258a3dafa04a99dd2b000c87"); } @Test - public void testHaplotypeCallerSingleSampleHaploid() { - HCTest(NA12878_BAM, "-ploidy 1", "e5c8a8bfb4d9d522e610a2299a9b32ad"); + public void testHaplotypeCallerSingleSampleHaploid() throws IOException { + HCTest(NA12878_BAM, "-ploidy 1", "304d2ade384406342655fdfd445576a3"); } @Test - public void testHaplotypeCallerSingleSampleTetraploid() { - HCTest(NA12878_BAM, "-ploidy 4", "af711f92b33d3f42e87a719745d93a68"); + public void testHaplotypeCallerSingleSampleTetraploid() throws IOException { + HCTest(NA12878_BAM, "-ploidy 4", "1d7aee93f3f2e331fcfa8f765467c66c"); } @Test - public void testHaplotypeCallerMinBaseQuality() { - HCTest(NA12878_BAM, "-mbq 15", "e85ada8486a4ed7231918187a100e1c3"); + public void testHaplotypeCallerMinBaseQuality() throws IOException { + HCTest(NA12878_BAM, "-mbq 15", "c741efeb6f3e412c4e707da3cabee621"); } @Test - public void testHaplotypeCallerMinBaseQualityHaploid() { - HCTest(NA12878_BAM, "-mbq 15 -ploidy 1", "e5c8a8bfb4d9d522e610a2299a9b32ad"); + public void testHaplotypeCallerMinBaseQualityHaploid() throws IOException { + HCTest(NA12878_BAM, "-mbq 15 -ploidy 1", "304d2ade384406342655fdfd445576a3"); } @Test - public void testHaplotypeCallerMinBaseQualityTetraploid() { - HCTest(NA12878_BAM, "-mbq 15 -ploidy 4", "af711f92b33d3f42e87a719745d93a68"); + public void testHaplotypeCallerMinBaseQualityTetraploid() throws IOException { + HCTest(NA12878_BAM, "-mbq 15 -ploidy 4", "1d7aee93f3f2e331fcfa8f765467c66c"); } @Test - public void testHaplotypeCallerGraphBasedSingleSample() { - HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "129965453a3ad9a22aa241f8c4afcbbf"); + public void testHaplotypeCallerGraphBasedSingleSample() throws IOException { + HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "ffd2363d2f7afd694b8e9b23c51b0cea"); } @Test - public void testHaplotypeCallerGraphBasedMultiSampleHaploid() { - HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased -ploidy 1", "1425b46f3cd50040a1272c8775672fc0"); + public void testHaplotypeCallerGraphBasedMultiSampleHaploid() throws IOException { + HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased -ploidy 1", "ab1630552bcc0a46431b3f6b7bd50bb5"); } @Test - public void testHaplotypeCallerGraphBasedMultiSample() { - HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "987c0bb684fc03bcc46cb619e8269fe4"); + public void testHaplotypeCallerGraphBasedMultiSample() throws IOException { + HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "31a21023efaf6f030478e5542ec652fe"); } @Test - public void testHaplotypeCallerSingleSampleWithDbsnp() { - HCTest(NA12878_BAM, "-D " + b37dbSNP132, "0f9f45384669cde243731ca803fa3a0b"); + public void testHaplotypeCallerSingleSampleWithDbsnp() throws IOException { + HCTest(NA12878_BAM, "-D " + b37dbSNP132, "1c91ca0c8c04cbce1ace3e9884efd458"); } @Test - public void testHaplotypeCallerMultiSampleGGA() { + public void testHaplotypeCallerMultiSampleGGA() throws IOException { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf" + " -isr INTERSECTION -L " + GGA_INTERVALS_FILE, - "86a060e9514eaf90c14ddaa7e6d07259"); + "2944a830504b4e0b87bb8babc8ea39ae"); } @Test - public void testHaplotypeCallerMultiSampleGGAHaploid() { + public void testHaplotypeCallerMultiSampleGGAHaploid() throws IOException { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -ploidy 1 -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf -isr INTERSECTION -L 20:10080000-10100000", - "2d32509234571132ec1fc84ebbc0c48b"); + "b4da788ff173453d915a807149d9ab5d"); } @Test - public void testHaplotypeCallerMultiSampleGGATetraploid() { + public void testHaplotypeCallerMultiSampleGGATetraploid() throws IOException { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -ploidy 4 -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf -isr INTERSECTION -L 20:10080000-10100000", - "b47d6e7b99a8e3413fa94d83a9e760fa"); + "7a3a8a81c3f984d74e6e3e35f5e62aa3"); } @Test - public void testHaplotypeCallerInsertionOnEdgeOfContig() { - HCTest(CEUTRIO_MT_TEST_BAM, "-L MT:1-10", "455843da6331c282cb88943f25dafeec"); + public void testHaplotypeCallerInsertionOnEdgeOfContig() throws IOException { + HCTest(CEUTRIO_MT_TEST_BAM, "-L MT:1-10", "60e578f65ab2be60f31ee8395845607a"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec); } @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "f2242d1696ef542196d363cf56159851"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "60df4797f86c1454c0eb76c5eaf2ad38"); } private void HCTestNearbySmallIntervals(String bam, String args, String md5) { @@ -189,7 +195,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference)); final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary()); - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); for( final File vcf : executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) { if( containsDuplicateRecord(vcf, parser) ) { @@ -218,7 +224,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerNearbySmallIntervals() { - HCTestNearbySmallIntervals(NA12878_BAM, "", "e0794be1404bffd88738968e26b48cbb"); + HCTestNearbySmallIntervals(NA12878_BAM, "", "7af25494bf2b05cc838ebf7055407c30"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -227,29 +233,29 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { // any of the calls in that region because it is so messy. @Test public void HCTestProblematicReadsModifiedInActiveRegions() { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("cf6fbb3636c52cd47dd14e0bd415a320")); + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("3f01b3be2004f784a0fddc9e63aeba2a")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("a145c3c0e99b278054a9960923da8aaa")); + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("1ca9a141cb65c6070a93d5a2c55a9b3b")); executeTest("HCTestStructuralIndels: ", spec); } @Test public void HCTestDoesNotFailOnBadRefBase() { // don't care about the output - just want to make sure it doesn't fail - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2"; + final String base = String.format("-T HaplotypeCaller --disableDithering -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2"; final WalkerTestSpec spec = new WalkerTestSpec(base, Collections.emptyList()); executeTest("HCTestDoesNotFailOnBadRefBase: ", spec); } @Test public void HCTestDanglingTailMergingForDeletions() throws IOException { - final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800 --allowNonUniqueKmersInRef"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800 --allowNonUniqueKmersInRef"; final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); final File outputVCF = executeTest("HCTestDanglingTailMergingForDeletions", spec).getFirst().get(0); @@ -267,6 +273,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { } + private static final String LEFT_ALIGNMENT_BAMOUT_TEST_INPUT = privateTestDir + "/bamout-indel-left-align-bugfix-input.bam"; + + private static final String LEFT_ALIGNMENT_BAMOUT_TEST_OUTPUT = privateTestDir + "/bamout-indel-left-align-bugfix-expected-output.bam"; + + @Test + public void testLeftAlignmentBamOutBugFix() { + final String base = String.format("-T HaplotypeCaller -pairHMMSub %s %s -R %s -I %s", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, REF, LEFT_ALIGNMENT_BAMOUT_TEST_INPUT) + + " --no_cmdline_in_header -bamout %s -o /dev/null -L 1:11740000-11740700 --allowNonUniqueKmersInRef"; + final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("c19f0e62f90794661f5927c360d50998")); + executeTest("LeftAlignmentBamOutBugFix", spec); + } + + // -------------------------------------------------------------------------------------------------------------- // // test dbSNP annotation @@ -276,41 +295,41 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestDBSNPAnnotationWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,090,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("86fb942473b3f8df2f8865209e551200")); + "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,090,000-10,100,000 -D " + b37dbSNP132, 1, + Arrays.asList("e894e9f50112edad270f36f78e76a8e3")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @Test public void HCTestDBSNPAnnotationWEx() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,100,000-11,000,000 -D " + b37dbSNP132 + "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,100,000-11,000,000 -D " + b37dbSNP132 + " -L " + hg19Intervals + " -isr INTERSECTION", 1, - Arrays.asList("b279dfa0864495d0cbe45167c13d5a75")); + Arrays.asList("9e384f2bd2eb7a6d5ee1685ab5e75501")); executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); } @Test public void HCTestDBSNPAnnotationWGSGraphBased() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,090,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("b6dab8a6223afeb9d0fa7c178c84c024")); + "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,090,000-10,100,000 -D " + b37dbSNP132, 1, + Arrays.asList("801a3af44153deee939370dcaaa110ab")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @Test public void HCTestDBSNPAnnotationWExGraphBased() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + " -L " + hg19Intervals + " -isr INTERSECTION", 1, - Arrays.asList("e46d99fb778e2aff09ccbbc1bda2a1bf")); + Arrays.asList("efac9fd7e7a92e3f130e7db9cbff4a45")); executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); } @Test public void HCTestGraphBasedPCRFreePositiveLogLkFix() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + hg19Reference + " --no_cmdline_in_header -I " + NA12878_PCRFREE250_ADAPTER_TRIMMED + " -o %s -L 20:10,024,000-10,024,500 " + "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + hg19Reference + " --no_cmdline_in_header -I " + NA12878_PCRFREE250_ADAPTER_TRIMMED + " -o %s -L 20:10,024,000-10,024,500 " , 1, Arrays.asList("")); executeTest("HCTestGraphBasedPCRFreePositiveLogLkFix", spec); @@ -325,25 +344,25 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestAggressivePcrIndelModelWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,270,000-10,300,000", 1, - Arrays.asList("163a42e1144be1dc905233a8a42b72f6")); + "-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,270,000-10,300,000", 1, + Arrays.asList("c851be534595a2547a8ebf81f1b923d1")); executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec); } @Test public void HCTestConservativePcrIndelModelWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,270,000-10,300,000", 1, - Arrays.asList("13298981348351e79a2ff5407f206c1d")); + "-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,270,000-10,300,000", 1, + Arrays.asList("95a3f339a15b1398cfc9f9e933999ea9")); executeTest("HC calling with conservative indel error modeling on WGS intervals", spec); } @Test public void testNoSuchEdgeBugFix() { - final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -dontTrimActiveRegions -ERC GVCF " + + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -dontTrimActiveRegions -ERC GVCF " + "-likelihoodEngine GraphBased -variant_index_type %s -variant_index_parameter %d", - b37KGReferenceWithDecoy, privateTestDir + "graphbased_no_such_edge_bug.bam", privateTestDir + "graphbased_no_such_edge_bug.intervals.bed", - HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReferenceWithDecoy, privateTestDir + "graphbased_no_such_edge_bug.bam", privateTestDir + "graphbased_no_such_edge_bug.intervals.bed", + GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); spec.disableShadowBCF(); executeTest("testGraphBasedNoSuchEdgeBugFix", spec); @@ -352,18 +371,18 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { // This test takes longer than 15 secs ... ~ 25-35 , @Test public void testLackSensitivityDueToBadHaplotypeSelectionFix() { - final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s --no_cmdline_in_header --maxNumHaplotypesInPopulation 16", - b37KGReferenceWithDecoy, privateTestDir + "hc-lack-sensitivity.bam", privateTestDir + "hc-lack-sensitivity.interval_list"); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("ae2d947d3ba3b139cc99efa877c4785c")); + final String commandLine = String.format("-T HaplotypeCaller -pairHMMSub %s %s -R %s -I %s -L %s --no_cmdline_in_header --maxNumHaplotypesInPopulation 16", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReferenceWithDecoy, privateTestDir + "hc-lack-sensitivity.bam", privateTestDir + "hc-lack-sensitivity.interval_list"); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("e6bc4d979ae90c35809a2030ad709b5e")); spec.disableShadowBCF(); executeTest("testLackSensitivityDueToBadHaplotypeSelectionFix", spec); } @Test public void testMissingKeyAlternativeHaplotypesBugFix() { - final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s --no_cmdline_in_header ", - b37KGReferenceWithDecoy, privateTestDir + "lost-alt-key-hap.bam", privateTestDir + "lost-alt-key-hap.interval_list"); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("423b70c151a5d0028e104aa3372b8783")); + final String commandLine = String.format("-T HaplotypeCaller -pairHMMSub %s %s -R %s -I %s -L %s --no_cmdline_in_header ", + HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReferenceWithDecoy, privateTestDir + "lost-alt-key-hap.bam", privateTestDir + "lost-alt-key-hap.interval_list"); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("e8ef8b17a7561dd056805c15baec285e")); spec.disableShadowBCF(); executeTest("testMissingKeyAlternativeHaplotypesBugFix", spec); } @@ -375,9 +394,9 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final String TEST_BAM = privateTestDir + "sw_epsilon_test.bam"; final String REFERENCE = b37KGReference; final String DBSNP = b37dbSNP138; - final String commandLineWithoutInterval = String.format("-T HaplotypeCaller -I %s -R %s -D %s " + final String commandLineWithoutInterval = String.format("-T HaplotypeCaller -pairHMMSub %s %s -I %s -R %s -D %s " + "-variant_index_type LINEAR -variant_index_parameter 128000 --no_cmdline_in_header " - + "-stand_call_conf 10.0 -stand_emit_conf 10.0",TEST_BAM,REFERENCE,DBSNP); + + "-stand_call_conf 10.0 -stand_emit_conf 10.0", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, TEST_BAM, REFERENCE, DBSNP); final String commandLineShortInterval = commandLineWithoutInterval + " -L " + SHORT_INTERVAL; final String commandLineLongInterval = commandLineWithoutInterval + " -L " + LONG_INTERVAL; @@ -386,11 +405,61 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { // but please make sure that both outputs get the same variant, // alleles all with DBSNP ids // We test here that change in active region size does not have an effect in placement of indels. - final String md5 = "8f572cfc1da9bdfc5e5725e092298ee6"; + final String md5 = "df27ceb13e6cda2c97cacd23608c2f7f"; final WalkerTestSpec shortSpec = new WalkerTestSpec(commandLineShortInterval + " -o %s",Arrays.asList(md5)); executeTest("testDifferentIndelLocationsDueToSWExactDoubleComparisonsFix::shortInterval",shortSpec); final WalkerTestSpec longSpec = new WalkerTestSpec(commandLineLongInterval + " -o %s",Arrays.asList(md5)); executeTest("testDifferentIndelLocationsDueToSWExactDoubleComparisonsFix::longInterval",longSpec); } + @Test + public void testHaplotypeCallerPairHMMException(){ + executeTest("HaplotypeCallerPairHMMException", + new WalkerTest.WalkerTestSpec( + " -T HaplotypeCaller" + + " --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE --maxReadsInRegionPerSample 1000 " + + " --minReadsPerAlignmentStart 5 --maxProbPropagationDistance 50 --activeProbabilityThreshold 0.002 " + + " --no_cmdline_in_header -minPruning 3 -pairHMM VECTOR_LOGLESS_CACHING -pairHMMSub TEST_BEYOND_CAPABILITIES " + + ALWAYS_LOAD_VECTOR_HMM + + " -R " + REF + + " -I " + NA12878_BAM + + " -L " + INTERVALS_FILE + + " -o %s", + 1, UserException.HardwareFeatureException.class)); + } + + @Test + public void testHaplotypeCallerDcovException(){ + executeTest("HaplotypeCallerDcovException", + new WalkerTest.WalkerTestSpec( + " -T HaplotypeCaller" + + " --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE --maxReadsInRegionPerSample 1000 " + + " --minReadsPerAlignmentStart 5 --maxProbPropagationDistance 50 --activeProbabilityThreshold 0.002 " + + " --no_cmdline_in_header -minPruning 3 -pairHMM VECTOR_LOGLESS_CACHING -pairHMMSub " + HMM_SUB_IMPLEMENTATION + + " -dcov 50" + + " -R " + REF + + " -I " + NA12878_BAM + + " -L " + INTERVALS_FILE + + " -o %s", + 1, UserException.CommandLineException.class)); + } + + @Test + public void testHaplotypeCallerMergeVariantsViaLDException(){ + executeTest("HaplotypeCallerMergeVariantsViaLDException", + new WalkerTest.WalkerTestSpec( + " -T HaplotypeCaller" + + " -R " + REF + + " -I " + NA12878_BAM + + " -L " + INTERVALS_FILE + + " --mergeVariantsViaLD " + + " -o %s", + 1, UserException.DeprecatedArgument.class)); + } + + @Test + public void testHaplotypeCallerTandemRepeatAnnotator() throws IOException{ + HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A TandemRepeatAnnotator -XA MappingQualityZero -XA SpanningDeletions", "481787c9275ab9f2e2b53025805472b7"); + } } + diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java index 7bf8b8a5a..f5894b84f 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerModesIntegrationTest.java @@ -70,6 +70,9 @@ public class HaplotypeCallerModesIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- + final static String HMM_SUB_IMPLEMENTATION = "UNVECTORIZED"; + final static String ALWAYS_LOAD_VECTOR_HMM = "-alwaysloadVectorHMM"; + @Test public void HCTestBamWriterCalledHaplotypes() { HCTestBamWriter(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, ""); // current MD5 = 9a2b6157f14b44b872a77f4e75c56023 @@ -82,7 +85,7 @@ public class HaplotypeCallerModesIntegrationTest extends WalkerTest { public void HCTestBamWriter(final HaplotypeBAMWriter.Type type, final String md5) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o /dev/null " + + "-T HaplotypeCaller -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o /dev/null " + "-bamout %s -L 20:10,000,000-10,010,000 -bamWriterType " + type, 1, Arrays.asList(md5)); executeTest("HC writing bams with mode " + type, spec); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java index d01a66801..ed3413364 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java @@ -60,6 +60,10 @@ import java.util.Arrays; import java.util.List; public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { + + final static String HMM_SUB_IMPLEMENTATION = "UNVECTORIZED"; + final static String ALWAYS_LOAD_VECTOR_HMM = "-alwaysloadVectorHMM"; + @DataProvider(name = "NCTDataProvider") public Object[][] makeNCTDataProvider() { List tests = new ArrayList<>(); @@ -74,7 +78,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { @Test(dataProvider = "NCTDataProvider") public void testHCNCT(final int nct, final String md5) { WalkerTestSpec spec = new WalkerTestSpec( - "-T HaplotypeCaller --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + "-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o %s " + " -L 20:10,000,000-10,100,000 -G none -A none -contamination 0.0 -nct " + nct, 1, Arrays.asList(md5)); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeLDCalculatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeLDCalculatorUnitTest.java new file mode 100644 index 000000000..2ef8b7332 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeLDCalculatorUnitTest.java @@ -0,0 +1,123 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +public class HaplotypeLDCalculatorUnitTest extends BaseTest { + HaplotypeLDCalculator calculator; + + @BeforeMethod + public void setUp() throws Exception { + calculator = new HaplotypeLDCalculator(); + } + + /** + * Tests that we get the right values from the R^2 calculation + */ + @Test + public void computeProbOfBeingPhased() { + logger.warn("Executing testCalculateR2LD"); + + // See AA, AB, and BA in population + Assert.assertEquals(calculator.pPhasedTest(0, 0, 0, -100), 0, 0.00001); + + // See AA, AB, BB in population + Assert.assertTrue(calculator.pPhasedTest(0, 0, -100, 0) < 0.5); + + // See AA and BB in population + Assert.assertEquals(calculator.pPhasedTest(0, -100, -100, 0), 1, 0.00001); + + // See AA, AB, and BA but no BBs in population + Assert.assertEquals(calculator.pPhasedTest(0, -20, -40, Double.NEGATIVE_INFINITY), 0, 0.00001); + + // See BB, AB, and BA but no AAs in population, so BB is the best explanation + Assert.assertEquals(calculator.pPhasedTest(Double.NEGATIVE_INFINITY, -20, -40, 0), 1, 0.00001); + + // See only AB and BA but no AAs nor BBs in population + Assert.assertEquals(calculator.pPhasedTest(Double.NEGATIVE_INFINITY, -20, -40, Double.NEGATIVE_INFINITY), 0, 0.00001); + + // Previously bad input + Assert.assertEquals(calculator.pPhasedTest(-400, -600, -1200, Double.NEGATIVE_INFINITY), 0, 0.00001); + + // first variant is just bad, so BA and BB are both very bad, shouldn't be phased + Assert.assertEquals(calculator.pPhasedTest(0, -1000, -100, -10000), 0, 0.00001); + + // second variant is just bad, so AB and BB are both very bad, shouldn't be phased + Assert.assertEquals(calculator.pPhasedTest(0, -100, -1000, -10000), 0, 0.00001); + + // AA is very good, all all others are quite poor. Shouldn't be phased + Assert.assertEquals(calculator.pPhasedTest(0, -1000, -1000, -10000), 0, 0.00001); + + + for ( int i = -10; i > -10000; i -= 10 ) { + // only bad het states + Assert.assertTrue(calculator.pPhasedTest(0, i, i, 0) > 0.99, "Failed for " + i); + + // BB state is terrible + Assert.assertTrue(calculator.pPhasedTest(0, 0, 0, i) < 0.5, "Failed for " + i); + + // truth is AB, BA, and BB + Assert.assertTrue(calculator.pPhasedTest(i, 0, 0, 0) < 0.5, "Failed for " + i); + + // truth is AB, BA + Assert.assertTrue(calculator.pPhasedTest(i, 0, 0, i) < 0.5, "Failed for " + i); + + // Only good signal is AB, so we shouldn't be phased + Assert.assertTrue(calculator.pPhasedTest(i, i, 0, i) < 0.5, "Failed for " + i); + Assert.assertTrue(calculator.pPhasedTest(i, 0, i, i) < 0.5, "Failed for " + i); + } + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeScoreComparatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeScoreComparatorUnitTest.java new file mode 100644 index 000000000..b137c3c20 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeScoreComparatorUnitTest.java @@ -0,0 +1,83 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; +import org.broadinstitute.gatk.utils.haplotype.HaplotypeScoreComparator; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class HaplotypeScoreComparatorUnitTest extends BaseTest { + @Test + public void testComparison() { + final List scores = Arrays.asList(3.0, 2.0, 1.0); + for ( final List myScores : Utils.makePermutations(scores, scores.size(), false) ) { + final List haps = new ArrayList(myScores.size()); + for ( final double score : myScores ) { + final Haplotype h = new Haplotype("ACT".getBytes(), false); + h.setScore(score); + haps.add(h); + } + + Collections.sort(haps, new HaplotypeScoreComparator()); + for ( int i = 0; i < myScores.size(); i++ ) + Assert.assertEquals(haps.get(i).getScore(), scores.get(i)); + } + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeSizeAndBaseComparatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeSizeAndBaseComparatorUnitTest.java new file mode 100644 index 000000000..593b3a833 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeSizeAndBaseComparatorUnitTest.java @@ -0,0 +1,89 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; +import org.broadinstitute.gatk.utils.haplotype.HaplotypeSizeAndBaseComparator; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** + * User: btaylor + * Date: 8/1/13 + * Time: 11:09 AM + */ +public class HaplotypeSizeAndBaseComparatorUnitTest extends BaseTest { + @Test + public void testComparison() { + // desired ordering is by size first, subordered by lexacographic relationship between bases + final List rawStrings = Arrays.asList("A", "C", "AC", "CC", "CT", "AAT", "ACT", "GAT", "ACGT"); + final List lexStrings = new ArrayList<>(rawStrings); + + for ( final List seqs : Utils.makePermutations(lexStrings, lexStrings.size(), false) ) { + final List haps = new ArrayList<>(seqs.size()); + for ( final String seq : seqs ) { + haps.add(new Haplotype(seq.getBytes(), false)); + } + + Collections.sort(haps, new HaplotypeSizeAndBaseComparator()); + for ( int i = 0; i < lexStrings.size(); i++ ) + Assert.assertEquals(haps.get(i).getBaseString(), lexStrings.get(i), "Failed sort " + haps + " expected " + lexStrings); + } + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LDMergerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LDMergerUnitTest.java new file mode 100644 index 000000000..826666d43 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LDMergerUnitTest.java @@ -0,0 +1,341 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import htsjdk.samtools.TextCigarCodec; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.*; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.gatk.utils.haplotype.EventMap; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.TreeSet; + +public class LDMergerUnitTest extends BaseTest { + LDMerger merger; + GenomeLocParser genomeLocParser; + + @BeforeClass + public void init() throws FileNotFoundException { + genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(b37KGReference))); + } + + @BeforeMethod + public void setUp() throws Exception { + merger = new LDMerger(); + } + + @Test + public void testCreateMergedVariantContext() { + logger.warn("Executing testCreateMergedVariantContext"); + + final byte[] ref = "AATTCCGGAATTCCGGAATT".getBytes(); + final GenomeLoc refLoc = genomeLocParser.createGenomeLoc("2", 1700, 1700 + ref.length); + + // SNP + SNP = simple MNP + VariantContext thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); + VariantContext nextVC = new VariantContextBuilder().loc("2", 1704, 1704).alleles("C","G").make(); + VariantContext truthVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","GG").source("merged").make(); + VariantContext mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // SNP + ref + SNP = MNP with ref base gap + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCG").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // insertion + SNP + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TAAAAACG").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // SNP + insertion + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CAAAAA").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCCAAAAA").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // deletion + SNP + thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","T").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TG").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // SNP + deletion + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","GCC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // insertion + deletion = MNP + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); + truthVC = new VariantContextBuilder().loc("2", 1704, 1706).alleles("CCG","ACC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // insertion + deletion + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","TAAAAACC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // insertion + insertion + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CA").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TACCA").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // deletion + deletion + thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); + truthVC = new VariantContextBuilder().loc("2", 1701, 1706).alleles("ATTCCG","ATCC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // deletion + insertion (abutting) + thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make(); + nextVC = new VariantContextBuilder().loc("2", 1702, 1702).alleles("T","GCGCGC").make(); + truthVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","AGCGCGC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // complex + complex + thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","AAA").make(); + nextVC = new VariantContextBuilder().loc("2", 1706, 1707).alleles("GG","AC").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1707).alleles("TCCGG","AAACAC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + } + + @Test + public void testInsertionDeletionBecomingNullAllele() { + final byte[] ref = "CAAA".getBytes(); + final GenomeLoc refLoc = genomeLocParser.createGenomeLoc("2", 1700, 1700 + ref.length); + + // insertion + deletion results in a null allele, should return false + final VariantContext thisVC = new VariantContextBuilder().loc("2", 1700, 1701).alleles("CA","C").make(); + final VariantContext nextVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("A","AA").make(); + final VariantContext mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + Assert.assertNull(mergedVC, "Insertion deletion becoming a null allele should return a null variant context"); + } + + /** + * Just returns a given R2 value for testing + */ + private static class MockLDCalculator extends HaplotypeLDCalculator { + private final double R2; + + private MockLDCalculator(double r2) { + R2 = r2; + } + + @Override + protected double computeProbOfBeingPhased(VariantContext first, VariantContext second) { + return R2; + } + } + + @DataProvider(name = "R2MergerData") + public Object[][] makeR2MergerData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + final double thres = LDMerger.MERGE_EVENTS_PROB_PHASED_THRESHOLD; + for ( final double r2 : Arrays.asList(0.0, thres - 0.01, thres + 0.01, 1.0) ) { + tests.add(new Object[]{"ACGT", "CCGC", 2, "4M", "ACGT", "CCGC", r2, r2 >= thres}); + tests.add(new Object[]{"ACGT", "AGGC", 2, "4M", "CGT", "GGC", r2, r2 >= thres}); + tests.add(new Object[]{"ACGT", "ACCC", 2, "4M", "GT", "CC", r2, r2 >= thres}); + tests.add(new Object[]{"ACGT", "ACCGTT", 2, "2M1I1M1I1M", "CG", "CCGT", r2, r2 >= thres}); + tests.add(new Object[]{"ACGT", "AGCT", 2, "4M", "CG", "GC", r2, r2 >= thres}); + tests.add(new Object[]{"ACAGT", "AAGC", 2, "1M1D3M", "ACAGT", "AAGC", r2, r2 >= thres}); + tests.add(new Object[]{"ACAGT", "AAT", 2, "1M1D1M1D1M", "ACAG", "AA", r2, r2 >= thres}); + + // cannot be merged -- only 1 event + tests.add(new Object[]{"AAA", "ACA", 1, "3M", null, null, r2, false}); + + final int dist = LDMerger.MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE + 2; + tests.add(new Object[]{Utils.dupString("A", dist), "C" + Utils.dupString("A", dist - 2) + "C", 2, dist + "M", null, null, r2, false}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "R2MergerData") + public void testR2Merger(final String refS, final String hapS, int nEvents, final String cigar, final String expectedMergedRef, final String expectedMergedAlt, final double r2, final boolean expectMerge) { + final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.decode(refS.length() + "M")); + final Haplotype hap = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.decode(cigar)); + final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length()); + + final List haplotypes = Arrays.asList(ref, hap); + final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false); + final MockLDCalculator r2Calc = new MockLDCalculator(r2); + + Assert.assertEquals(vcStarts.size(), nEvents); + final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc); + Assert.assertEquals(merged, expectMerge); + Assert.assertEquals(vcStarts.size(), expectMerge ? 1 : nEvents); + if ( expectMerge ) { + final VariantContext vc = hap.getEventMap().getVariantContexts().iterator().next(); + Assert.assertTrue(vc.isBiallelic()); + Assert.assertEquals(vc.getReference().getDisplayString(), expectedMergedRef); + Assert.assertEquals(vc.getAlternateAllele(0).getDisplayString(), expectedMergedAlt); + } + } + + @Test + public void testR2MergerWithThirdHapWithoutEvent() { + final String refS = "ACGT"; + final String hapS = "CCGA"; + final String cigar = "4M"; + final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.decode(refS.length() + "M")); + final Haplotype hap1 = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.decode(cigar)); + final Haplotype hap2 = new Haplotype("ACGA".getBytes(), false, 0, TextCigarCodec.decode(cigar)); + final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length()); + + final List haplotypes = Arrays.asList(ref, hap1, hap2); + final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false); + final MockLDCalculator r2Calc = new MockLDCalculator(1.0); + + Assert.assertEquals(vcStarts.size(), 2); + final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc); + Assert.assertEquals(merged, true); + Assert.assertEquals(vcStarts.size(), 1); + + final VariantContext vc = hap1.getEventMap().getVariantContexts().iterator().next(); + Assert.assertTrue(vc.isBiallelic()); + Assert.assertEquals(vc.getReference().getDisplayString(), "ACGT"); + Assert.assertEquals(vc.getAlternateAllele(0).getDisplayString(), "CCGA"); + + Assert.assertEquals(hap2.getEventMap().size(), 0); + } + + @Test + public void testR2MergerWithMultipleAllelesAtSites() { + final String refS = "ACGT"; + final String hapS = "TCGA"; + final String cigar = "4M"; + final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.decode(refS.length() + "M")); + final Haplotype hap1 = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.decode(cigar)); + + final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length()); + for (final String hap2S : Arrays.asList("GCGA", "TCGG")) { + final Haplotype hap2 = new Haplotype(hap2S.getBytes(), false, 0, TextCigarCodec.decode(cigar)); + + final List haplotypes = Arrays.asList(ref, hap1, hap2); + final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false); + final MockLDCalculator r2Calc = new MockLDCalculator(1.0); + + Assert.assertEquals(vcStarts.size(), 2); + final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc); + Assert.assertEquals(merged, false); + Assert.assertEquals(vcStarts.size(), 2); + } + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java index cb1f31a84..40e3aa30e 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java @@ -61,8 +61,8 @@ import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.pairhmm.PairHMM; -import org.broadinstitute.gatk.utils.recalibration.covariates.RepeatCovariate; -import org.broadinstitute.gatk.utils.recalibration.covariates.RepeatLengthCovariate; +import org.broadinstitute.gatk.engine.recalibration.covariates.RepeatCovariate; +import org.broadinstitute.gatk.engine.recalibration.covariates.RepeatLengthCovariate; import htsjdk.variant.variantcontext.*; import org.testng.Assert; import org.testng.annotations.BeforeSuite; @@ -138,7 +138,7 @@ public class PairHMMLikelihoodCalculationEngineUnitTest extends BaseTest { public void createPcrErrorModelTest(final String repeat, final int repeatLength) { final PairHMMLikelihoodCalculationEngine engine = new PairHMMLikelihoodCalculationEngine((byte)0, - PairHMM.HMM_IMPLEMENTATION.ORIGINAL, 0.0, true, + PairHMM.HMM_IMPLEMENTATION.ORIGINAL, PairHMM.HMM_SUB_IMPLEMENTATION.UNVECTORIZED, false, 0.0, true, PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE); final String readString = Utils.dupString(repeat, repeatLength); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMProbabilityBugIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMProbabilityBugIntegrationTest.java new file mode 100644 index 000000000..4ec4c4cbf --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMProbabilityBugIntegrationTest.java @@ -0,0 +1,92 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; + +/** + * Test for the Prob > 1 bug in PairHMM using callers. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class PairHMMProbabilityBugIntegrationTest extends WalkerTest { + + private static final File REFERENCE = new File("/humgen/gsa-hpprojects/GATK/bundle/current/hg19/ucsc.hg19.fasta").getAbsoluteFile(); + private static final File BAM = new File (privateTestDir, "pairhmm_prob_bug.bam").getAbsoluteFile(); + private static final File INTERVAL = new File (privateTestDir, "pairhmm_prob_bug.interval.bed").getAbsoluteFile(); + + private static final File UG_BAM = new File(privateTestDir, "pairhmm_prob_bug.ug.bam").getAbsoluteFile(); + private static final File UG_INTERVAL = new File(privateTestDir, "pairhmm_prob_bug.ug.intervals.bed").getAbsoluteFile(); + + + @Test + public void testHaplotypeCaller() { + final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s", + REFERENCE,BAM,INTERVAL); + final String name = getClass().getSimpleName() + ".testHaplotypeCaller"; + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); + executeTest(name, spec); + } + + @Test + public void testUnifiedGenotyper() { + final String commandLine = String.format("-T UnifiedGenotyper -R %s -I %s -L %s -dcov 200 -glm INDEL", + REFERENCE,UG_BAM,UG_INTERVAL); + final String name = getClass().getSimpleName() + ".testUnifiedGenotyper"; + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); + executeTest(name, spec); + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java index 2c702d0b1..119cf45fd 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java @@ -52,15 +52,12 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleListUtils; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.HaplotypeGraph; import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.pairhmm.ActiveRegionTestDataSet; -import org.broadinstitute.gatk.utils.pairhmm.FastLoglessPairHMM; -import org.broadinstitute.gatk.utils.pairhmm.FlexibleHMM; -import org.broadinstitute.gatk.utils.pairhmm.PairHMM; +import org.broadinstitute.gatk.utils.pairhmm.*; import org.broadinstitute.gatk.utils.sam.ClippedGATKSAMRecord; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.testng.Assert; @@ -98,7 +95,8 @@ public class ReadThreadingLikelihoodCalculationEngineUnitTest extends ActiveRegi //final PairHMMLikelihoodCalculationEngine fullPairHMM = new PairHMMLikelihoodCalculationEngine((byte)10, false, // PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3); final PairHMMLikelihoodCalculationEngine fullPairHMM = new PairHMMLikelihoodCalculationEngine((byte)10, - PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, Double.NEGATIVE_INFINITY,true, PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.NONE); + PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, PairHMM.HMM_SUB_IMPLEMENTATION.UNVECTORIZED, true, Double.NEGATIVE_INFINITY, + true, PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.NONE); // When using likelihoods it should be around 0.05 since // When using maximum-likelihoods it can be as low as 0.00001 diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java index 6eb207bf1..078177f4a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java @@ -60,12 +60,15 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.*; import org.broadinstitute.gatk.utils.*; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants; import org.testng.Assert; @@ -390,7 +393,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { refModel.getStart() - call.getStart() + 1), refModel.getReference().getBaseString(), "" + data.getRefHap()); // the reference must be the same. Assert.assertTrue(refModel.getGenotype(0).getGQ() <= 0); // No confidence in the reference hom-ref call across the deletion Assert.assertEquals(refModel.getAlleles().size(),2); // the reference and the lonelly - Assert.assertEquals(refModel.getAlleles().get(1),GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + Assert.assertEquals(refModel.getAlleles().get(1), GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); } else { Assert.assertEquals(refModel, call, "Should have found call " + call + " but found " + refModel + " instead"); } @@ -401,7 +404,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { Assert.assertEquals(refModel.getEnd(), loc.getStart() + i); Assert.assertFalse(refModel.hasLog10PError()); Assert.assertEquals(refModel.getAlternateAlleles().size(), 1); - Assert.assertEquals(refModel.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + Assert.assertEquals(refModel.getAlternateAllele(0), GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); Assert.assertTrue(refModel.hasGenotype(sample)); final Genotype g = refModel.getGenotype(sample); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/graphs/KBestHaplotypeFinderUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/graphs/KBestHaplotypeFinderUnitTest.java index b1bc1e22c..33d091953 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/graphs/KBestHaplotypeFinderUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/graphs/KBestHaplotypeFinderUnitTest.java @@ -452,7 +452,7 @@ public class KBestHaplotypeFinderUnitTest extends BaseTest { Cigar expected = new Cigar(); expected.add(new CigarElement(padSize, CigarOperator.M)); if ( ! prefix.equals("") ) expected.add(new CigarElement(prefix.length(), CigarOperator.M)); - for ( final CigarElement elt : TextCigarCodec.getSingleton().decode(midCigar).getCigarElements() ) expected.add(elt); + for ( final CigarElement elt : TextCigarCodec.decode(midCigar).getCigarElements() ) expected.add(elt); if ( ! end.equals("") ) expected.add(new CigarElement(end.length(), CigarOperator.M)); expected.add(new CigarElement(padSize, CigarOperator.M)); expected = AlignmentUtils.consolidateCigar(expected); @@ -513,7 +513,7 @@ public class KBestHaplotypeFinderUnitTest extends BaseTest { public void testLeftAlignCigarSequentiallyAdjacentID() { final String ref = "GTCTCTCTCTCTCTCTCTATATATATATATATATTT"; final String hap = "GTCTCTCTCTCTCTCTCTCTCTATATATATATATTT"; - final Cigar originalCigar = TextCigarCodec.getSingleton().decode("18M4I12M4D2M"); + final Cigar originalCigar = TextCigarCodec.decode("18M4I12M4D2M"); final Cigar result = CigarUtils.leftAlignCigarSequentially(originalCigar, ref.getBytes(), hap.getBytes(), 0, 0); logger.warn("Result is " + result); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealignerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealignerIntegrationTest.java index a66d862ef..c08420fa2 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealignerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealignerIntegrationTest.java @@ -175,12 +175,13 @@ public class IndelRealignerIntegrationTest extends WalkerTest { executeTest("test realigner nWayOut", spec1); } - @Test(expectedExceptions = RuntimeException.class) // because TESTNG wraps UserExceptions in RuntimeExceptions - public void testBadCigarString() { + @Test + public void testBadCigarStringDoesNotFail() { + // Just making sure the test runs without an error, don't care about the MD5 value WalkerTestSpec spec = new WalkerTestSpec( "-T IndelRealigner -R " + b37KGReference + " -I " + privateTestDir + "Realigner.error.bam -L 19:5787200-5787300 -targetIntervals 19:5787205-5787300 -o %s", 1, - Arrays.asList("FAILFAILFAILFAILFAILFAILFAILFAIL")); - executeTest("test bad cigar", spec); + Arrays.asList("")); + executeTest("test bad cigar string does not fail", spec); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/RealignerTargetCreatorIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/RealignerTargetCreatorIntegrationTest.java index c8a2f19aa..ff1ebe32a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/RealignerTargetCreatorIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/indels/RealignerTargetCreatorIntegrationTest.java @@ -100,4 +100,14 @@ public class RealignerTargetCreatorIntegrationTest extends WalkerTest { Arrays.asList("5206cee6c01b299417bf2feeb8b3dc96")); executeTest("test rods only", spec3); } + + @Test() + public void testBadCigarStringDoesNotFail() { + // Just making sure the test runs without an error, don't care about the MD5 value + WalkerTestSpec spec = new WalkerTestSpec( + "-T RealignerTargetCreator -R " + b37KGReference + " -I " + privateTestDir + "Realigner.error.bam -L 19:5787200-5787300 -o %s", + 1, + Arrays.asList("")); + executeTest("test bad cigar string string does not fail", spec); + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java new file mode 100644 index 000000000..26a4a92f7 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/phasing/PhasingUtilsUnitTest.java @@ -0,0 +1,288 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.phasing; + +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.variant.variantcontext.*; +import java.io.File; +import java.io.FileNotFoundException; +import java.util.Arrays; +import java.util.ArrayList; +import java.util.List; +import java.util.HashMap; +import java.util.Map; + +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.Test; + +class AlwaysTrueMergeRule extends PhasingUtils.AlleleMergeRule { + public boolean allelesShouldBeMerged(VariantContext vc1, VariantContext vc2) { + return true; + } +} + +public class PhasingUtilsUnitTest extends BaseTest { + + private final int start = 10; + GenomeLocParser genomeLocParser; + private ReferenceSequenceFile referenceFile; + private Genotype genotype1; + private Genotype genotype2; + private String contig; + private List alleleList1; + private List alleleList2; + VariantContext vc1; + VariantContext vc2; + + @BeforeSuite + public void init() throws FileNotFoundException { + referenceFile = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + genomeLocParser = new GenomeLocParser(referenceFile); + alleleList1 = Arrays.asList(Allele.create("T", true), Allele.create("C", false)); + alleleList2 = Arrays.asList(Allele.create("G", true), Allele.create("A", false)); + genotype1 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).attribute("PQ", 100.0).alleles(alleleList1).make(); + genotype2 = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-2", "10-1"}).attribute("PQ", 200.0).alleles(alleleList2).make(); + contig = new String("1"); + vc1 = new VariantContextBuilder().chr(contig).id("id1").source("TC").start(start).stop(start).alleles(alleleList1).genotypes(genotype1).make(); + vc2 = new VariantContextBuilder().chr(contig).id("id2").source("GA").start(start+1).stop(start+1).alleles(alleleList2).genotypes(genotype2).make(); + } + + @Test + public void TestMatchHaplotypeAllelesKeyHP() { + + final PhasingUtils.SameHaplotypeAlleles sameHaplotypeAlleles = PhasingUtils.matchHaplotypeAlleles(genotype1, genotype2); + + final PhasingUtils.SameHaplotypeAlleles sameHaplotypeAllelesExpected = new PhasingUtils.SameHaplotypeAlleles(); + sameHaplotypeAllelesExpected.hapAlleles.add(new PhasingUtils.AlleleOneAndTwo(Allele.create("T", true), Allele.create("A", false))); + sameHaplotypeAllelesExpected.hapAlleles.add(new PhasingUtils.AlleleOneAndTwo(Allele.create("C", false), Allele.create("G", true))); + sameHaplotypeAllelesExpected.requiresSwap = true; + + Assert.assertEquals(sameHaplotypeAlleles.hapAlleles, sameHaplotypeAllelesExpected.hapAlleles); + Assert.assertEquals(sameHaplotypeAlleles.requiresSwap, sameHaplotypeAllelesExpected.requiresSwap); + } + + @Test + public void TestMatchHaplotypeAllelesNoKeyHP() { + + final Genotype genotypeNoKeyHP1 = new GenotypeBuilder().name("TC").alleles(alleleList1).make(); + final Genotype genotypeNoKeyHP2 = new GenotypeBuilder().name("GA").alleles(alleleList2).make(); + + final PhasingUtils.SameHaplotypeAlleles sameHaplotypeAlleles = PhasingUtils.matchHaplotypeAlleles(genotypeNoKeyHP1, genotypeNoKeyHP2); + final PhasingUtils.SameHaplotypeAlleles sameHaplotypeAllelesExpected = new PhasingUtils.SameHaplotypeAlleles(); + sameHaplotypeAllelesExpected.hapAlleles.add(new PhasingUtils.AlleleOneAndTwo(Allele.create("T", true), Allele.create("G", true))); + sameHaplotypeAllelesExpected.hapAlleles.add(new PhasingUtils.AlleleOneAndTwo(Allele.create("C", false), Allele.create("A", false))); + Assert.assertEquals(sameHaplotypeAlleles.hapAlleles, sameHaplotypeAllelesExpected.hapAlleles); + Assert.assertEquals(sameHaplotypeAlleles.requiresSwap, sameHaplotypeAllelesExpected.requiresSwap); + } + + @Test + public void TestMergeIntoMNPvalidationTrueCheck() { + Assert.assertTrue(PhasingUtils.mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc2)); + } + + @Test + public void TestMergeIntoMNPvalidationCheckLocBefore() { + final VariantContext vc1before = new VariantContextBuilder().chr(contig).id("id1").source("TC").start(start+1).stop(start+1).alleles(alleleList1).make(); + final VariantContext vc2after = new VariantContextBuilder().chr(contig).id("id2").source("GA").start(start).stop(start).alleles(alleleList2).make(); + Assert.assertFalse(PhasingUtils.mergeIntoMNPvalidationCheck(genomeLocParser, vc1before, vc2after)); + } + + @Test + public void TestMergeIntoMNPvalidationFiltered() { + final List filters = Arrays.asList("filter"); + final Genotype genotype = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).alleles(alleleList1).filters(filters).make(); + final VariantContext vc = new VariantContextBuilder().chr(contig).id("id1").source("TC").start(start).stop(start).alleles(alleleList1).genotypes(genotype).make(); + Assert.assertFalse(PhasingUtils.mergeIntoMNPvalidationCheck(genomeLocParser, vc, vc2)); + } + + @Test + public void TestMergeIntoMNPvalidationFilterNoCall() { + final List filters = Arrays.asList("filter"); + final List alleleList = Arrays.asList(Allele.create("T", true), Allele.create(".", false)); + final Genotype genotype = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).alleles(alleleList).filters(filters).make(); + final VariantContext vc = new VariantContextBuilder().chr(contig).id("id1").source("TC").start(start).stop(start).alleles(alleleList1).genotypes(genotype).make(); + Assert.assertFalse(PhasingUtils.mergeIntoMNPvalidationCheck(genomeLocParser, vc, vc2)); + } + + @Test + public void TestMergeIntoMNPvalidationDiffSampleNames() { + final Genotype genotype = new GenotypeBuilder().name("sample1").attribute("HP", new String[]{"10-1", "10-2"}).alleles(alleleList1).make(); + final VariantContext vc = new VariantContextBuilder().chr(contig).id("id1").source("TC").start(start).stop(start).alleles(alleleList1).genotypes(genotype).make(); + Assert.assertFalse(PhasingUtils.mergeIntoMNPvalidationCheck(genomeLocParser, vc, vc2)); + } + + @Test + public void TestMergeIntoMNPvalidationDiffContigs() { + final String contig = new String("2"); + final VariantContext vc = new VariantContextBuilder().chr(contig).id("id1").source("TC").start(start+1).stop(start+1).alleles(alleleList2).genotypes(genotype2).make(); + Assert.assertFalse(PhasingUtils.mergeIntoMNPvalidationCheck(genomeLocParser, vc1, vc)); + } + + @Test + public void TestMergeVariantContextAttributes() { + Assert.assertEquals(0, PhasingUtils.mergeVariantContextAttributes(vc1, vc2).size()); + } + + @Test + public void TestAllSamplesAreMergeable() { + Assert.assertTrue(PhasingUtils.allSamplesAreMergeable(vc1, vc2)); + } + + @Test + public void TestAlleleSegregationIsKnown(){ + Assert.assertTrue(PhasingUtils.alleleSegregationIsKnown(genotype1, genotype2)); + } + + @Test + public void TestSomeSampleHasDoubleNonReferenceAlleleTrue(){ + Genotype genotype = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).alleles(alleleList2).make(); + VariantContext vc = new VariantContextBuilder().chr(contig).id("id2").source("GA").start(start+1).stop(start+1).alleles(alleleList2).genotypes(genotype).make(); + Assert.assertTrue(PhasingUtils.someSampleHasDoubleNonReferenceAllele(vc1, vc)); + } + + @Test + public void TestSomeSampleHasDoubleNonReferenceAlleleFalse(){ + Assert.assertFalse(PhasingUtils.someSampleHasDoubleNonReferenceAllele(vc1, vc2)); + } + + @Test + public void TestDoubleAllelesSegregatePerfectlyAmongSamples(){ + final Genotype genotype = new GenotypeBuilder().name("sample").attribute("HP", new String[]{"10-1", "10-2"}).alleles(alleleList2).make(); + final VariantContext vc = new VariantContextBuilder().chr(contig).id("id2").source("GA").start(start+1).stop(start+1).alleles(alleleList2).genotypes(genotype).make(); + Assert.assertTrue(PhasingUtils.doubleAllelesSegregatePerfectlyAmongSamples(vc1, vc)); + } + + @Test + public void TestMergeVariantContextNames() { + final String result = new String("A_B"); + Assert.assertEquals(result, PhasingUtils.mergeVariantContextNames("A", "B")); + } + + @Test + public void TestMergeIntoMNP(){ + final AlwaysTrueMergeRule alleleMergeRule = new AlwaysTrueMergeRule(); + final VariantContext vc = PhasingUtils.mergeIntoMNP(genomeLocParser, vc1, vc2, referenceFile, alleleMergeRule); + + final List alleleList = Arrays.asList(Allele.create("TG", true), Allele.create("TA", false), Allele.create("CG", false)); + final Map attributes = new HashMap(){{ + put("AC", new ArrayList(Arrays.asList(1, 1))); + put("AF", new ArrayList(Arrays.asList(0.5, 0.5))); + put("AN", 2); + }}; + final Map extendedAttributes = new HashMap(){{ + put("PQ", 100.0); put("HP", new String[]{"10-1", "10-2"}); + }}; + final List alleleListMeged = Arrays.asList(Allele.create("TA"), Allele.create("CG")); + final Genotype genotype = new GenotypeBuilder().name("sample").attributes(extendedAttributes).alleles(alleleListMeged).make(); + final VariantContext vcExpected = new VariantContextBuilder().chr(contig).id("id1;id2").source("TC_GA").start(start).stop(start+1).alleles(alleleList).genotypes(genotype).attributes(attributes).make(); + Assert.assertTrue(genotype.sameGenotype(vcExpected.getGenotypes().get("sample"))); + Assert.assertTrue(vcExpected.hasSameAllelesAs(vc)); + Assert.assertEquals(vcExpected.getChr(), vc.getChr()); + Assert.assertEquals(vcExpected.getStart(), vc.getStart()); + Assert.assertEquals(vcExpected.getEnd(), vc.getEnd()); + Assert.assertEquals(vcExpected.getID(), vc.getID()); + Assert.assertEquals(vcExpected.getSource(), vc.getSource()); + Assert.assertEquals(vcExpected.isFiltered(), vc.isFiltered()); + Assert.assertEquals(vcExpected.getPhredScaledQual(), vc.getPhredScaledQual()); + Assert.assertEquals(vcExpected.getAttribute("PQ"), vc.getAttribute("PQ")); + Assert.assertEquals(vcExpected.getAttribute("HP"), vc.getAttribute("HP")); + } + + @Test + public void TestReallyMergeIntoMNP( ){ + final VariantContext vc = PhasingUtils.reallyMergeIntoMNP(vc1, vc2, referenceFile); + + final List alleleList = Arrays.asList(Allele.create("TG", true), Allele.create("TA", false), Allele.create("CG", false)); + final Map attributes = new HashMap(){{ + put("AC", new ArrayList(Arrays.asList(1, 1))); + put("AF", new ArrayList(Arrays.asList(0.5, 0.5))); + put("AN", 2); + }}; + final Map extendedAttributes = new HashMap(){{ + put("PQ", 100.0); put("HP", new String[]{"10-1", "10-2"}); + }}; + final List alleleListMeged = Arrays.asList(Allele.create("TA"), Allele.create("CG")); + final Genotype genotype = new GenotypeBuilder().name("sample").attributes(extendedAttributes).alleles(alleleListMeged).make(); + final VariantContext vcExpected = new VariantContextBuilder().chr(contig).id("id1;id2").source("TC_GA").start(start).stop(start+1).alleles(alleleList).genotypes(genotype).attributes(attributes).make(); + Assert.assertTrue(genotype.sameGenotype(vcExpected.getGenotypes().get("sample"))); + Assert.assertTrue(vcExpected.hasSameAllelesAs(vc)); + Assert.assertEquals(vcExpected.getChr(), vc.getChr()); + Assert.assertEquals(vcExpected.getStart(), vc.getStart()); + Assert.assertEquals(vcExpected.getEnd(), vc.getEnd()); + Assert.assertEquals(vcExpected.getID(), vc.getID()); + Assert.assertEquals(vcExpected.getSource(), vc.getSource()); + Assert.assertEquals(vcExpected.isFiltered(), vc.isFiltered()); + Assert.assertEquals(vcExpected.getPhredScaledQual(), vc.getPhredScaledQual()); + Assert.assertEquals(vcExpected.getAttribute("PQ"), vc.getAttribute("PQ")); + Assert.assertEquals(vcExpected.getAttribute("HP"), vc.getAttribute("HP")); + } + + @Test + public void TestAllGenotypesAreUnfilteredAndCalled(){ + final VariantContext vc = new VariantContextBuilder().chr(contig).id("id1").source("TC").start(start).stop(start).alleles(alleleList1).genotypes(genotype1).make(); + Assert.assertTrue(PhasingUtils.allGenotypesAreUnfilteredAndCalled(vc)); + } + + @Test + public void TestEnsureMergedAllele(){ + byte[] intermediateBases = new byte[]{'A','T'}; + final PhasingUtils.MergedAllelesData mergeData = new PhasingUtils.MergedAllelesData(intermediateBases, vc1, vc2); + final Allele allele = mergeData.ensureMergedAllele(Allele.create("T", true), Allele.create("C", true)); + final Allele expectedAllele = Allele.create(new byte[]{'T', 'A', 'T', 'C'}, false); + Assert.assertEquals(allele, expectedAllele); + } + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java index 6ed7b7d0b..c615315a9 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java @@ -91,6 +91,14 @@ public class SplitNCigarReadsIntegrationTest extends WalkerTest { executeTest("test splits with overhangs", spec); } + @Test + public void testSplitsFixNDN() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T SplitNCigarReads -R " + b37KGReference + " -I " + privateTestDir + "splitNCigarReadsSnippet.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS -fixNDN", 1, + Arrays.asList("4ee1c1a64847e2b2f660a3a86f9d7e32")); + executeTest("test fix NDN", spec); + } + @Test public void testSplitsWithOverhangsNotClipping() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsUnitTest.java index d28d8991a..f92afc6f2 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsUnitTest.java @@ -124,7 +124,7 @@ public class SplitNCigarReadsUnitTest extends BaseTest { if(numOfSplits != 0 && isCigarDoesNotHaveEmptyRegionsBetweenNs(cigar)){ final TestManager manager = new TestManager(); - GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar, 0); SplitNCigarReads.splitNCigarRead(read, manager); List splitReads = manager.getReadsInQueueForTesting(); final int expectedReads = numOfSplits+1; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/validation/ValidationSiteSelectorIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/validation/ValidationSiteSelectorIntegrationTest.java index 4c385ab5a..cbe3408de 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/validation/ValidationSiteSelectorIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/validation/ValidationSiteSelectorIntegrationTest.java @@ -84,7 +84,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(sampleNone + freqUnif + "--variant " + testfile), 1, - Arrays.asList("658c70cbb93faed8ca18e51cd6dd593f") + Arrays.asList("19fe0e3297bfd502911608490222a2fd") ); executeTest("testNoSampleSelectionFreqUniform--" + testfile, spec); @@ -96,7 +96,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(sampleNone + freqAF + "--variant " + testfile), 1, - Arrays.asList("90411433ea42846352b767da735af53b") + Arrays.asList("91a7f3bf452241040646e61ee6ab1a23") ); executeTest("testNoSampleSelectionFreqAF--" + testfile, spec); @@ -108,7 +108,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(sampleGT + freqUnif + "--variant " + testfile), 1, - Arrays.asList("2afabd447185cf017f60c85380902117") + Arrays.asList("faae3baf1feb76877fcb81c01b5d44f3") ); executeTest("testPolyGTFreqUniform--" + testfile, spec); @@ -120,7 +120,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(sampleGT + freqAF + "--variant " + testfile), 1, - Arrays.asList("381e1a2f0e1908b4d7cba5d6361cf5aa") + Arrays.asList("d2ba4e3e76f87e2c1a12d82e7a3dc595") ); executeTest("testPolyGTFreqAF--" + testfile, spec); @@ -132,7 +132,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(sampleGL + freqAF + "--variant " + testfile), 1, - Arrays.asList("381e1a2f0e1908b4d7cba5d6361cf5aa") + Arrays.asList("d2ba4e3e76f87e2c1a12d82e7a3dc595") ); executeTest("testPolyGLFreqAF--" + testfile, spec); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEvalWalkerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEvalWalkerUnitTest.java index 13538d304..964d6c151 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEvalWalkerUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEvalWalkerUnitTest.java @@ -55,9 +55,9 @@ package org.broadinstitute.gatk.tools.walkers.varianteval; // the imports for unit testing. import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.evaluators.VariantEvaluator; import org.broadinstitute.gatk.tools.walkers.varianteval.stratifications.VariantStratifier; import org.broadinstitute.gatk.tools.walkers.varianteval.stratifications.manager.StratificationManager; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java index 86bebd5a4..872f5f399 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantGaussianMixtureModelUnitTest.java @@ -73,7 +73,7 @@ public final class VariantGaussianMixtureModelUnitTest extends BaseTest { VariantDatum[] variantData1 = new VariantDatum[N_VARIANTS]; private final File QUAL_DATA = new File(privateTestDir + "tranches.raw.dat"); - private final double[] TRUTH_SENSITIVITY_CUTS = new double[]{99.9, 99.0, 97.0, 95.0}; + private final List TRUTH_SENSITIVITY_CUTS = new ArrayList(Arrays.asList(99.9, 99.0, 97.0, 95.0)); private final File EXPECTED_TRANCHES_NEW = new File(privateTestDir + "tranches.6.txt"); private final File EXPECTED_TRANCHES_OLD = new File(privateTestDir + "tranches.4.txt"); @@ -136,7 +136,7 @@ public final class VariantGaussianMixtureModelUnitTest extends BaseTest { } } - private static List findMyTranches(ArrayList vd, double[] tranches) { + private static List findMyTranches(ArrayList vd, List tranches) { final int nCallsAtTruth = TrancheManager.countCallsAtTruth( vd, Double.NEGATIVE_INFINITY ); final TrancheManager.SelectionMetric metric = new TrancheManager.TruthSensitivityMetric( nCallsAtTruth ); return TrancheManager.findTranches(vd, tranches, metric, VariantRecalibratorArgumentCollection.Mode.SNP); @@ -153,6 +153,6 @@ public final class VariantGaussianMixtureModelUnitTest extends BaseTest { @Test(expectedExceptions = {UserException.class}) public final void testBadFDR() { ArrayList vd = readData(); - List tranches = findMyTranches(vd, new double[]{-1}); + List tranches = findMyTranches(vd, new ArrayList(Arrays.asList(-1.0))); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index e47b26758..3d5463d0f 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -51,9 +51,8 @@ package org.broadinstitute.gatk.tools.walkers.variantrecalibration; +import org.broadinstitute.gatk.utils.variant.VCIterable; import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; -import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFCodec; import org.testng.Assert; @@ -340,7 +339,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { final List outputFiles = executeTest("testApplyRecalibrationSnpAndIndelTogether", spec).getFirst(); setPDFsForDeletion(outputFiles); final File VCF = outputFiles.get(0); - for( final VariantContext VC : GATKVCFUtils.readAllVCs(VCF, new VCFCodec()).getSecond() ) { + for( final VariantContext VC : VCIterable.readAllVCs(VCF, new VCFCodec()).getSecond() ) { if( VC != null ) { Assert.assertTrue(VC.isNotFiltered()); // there should only be unfiltered records in the output VCF file } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CalculateGenotypePosteriorsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CalculateGenotypePosteriorsIntegrationTest.java index 9b659a89b..7f860f343 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CalculateGenotypePosteriorsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CalculateGenotypePosteriorsIntegrationTest.java @@ -61,6 +61,8 @@ public class CalculateGenotypePosteriorsIntegrationTest extends WalkerTest { private static String CEUtrioFamilyFile = privateTestDir + "CEUtrio.ped"; private static String CEUtrioTest = privateTestDir + "CEUtrioTest.vcf"; private static String CEUtrioPopPriorsTest = privateTestDir + "CEUtrioPopPriorsTest.vcf"; + private static String threeMemberNonTrioFamilyFile = privateTestDir + "threeMemberNonTrio.ped"; + private static String getThreeMemberNonTrioTest = privateTestDir + "threeMemberNonTrioTest.vcf"; @Test(enabled = true) //use the first 20 variants to save time; they have a nice range of AC from 4 to over 4000 @@ -119,5 +121,19 @@ public class CalculateGenotypePosteriorsIntegrationTest extends WalkerTest { executeTest("testFamilyPriors", spec); } + @Test(enabled = true) + public void testSingleParentFamily() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CalculateGenotypePosteriors --no_cmdline_in_header " + + " -o %s" + + " -R " + b37KGReference + + " -ped " + threeMemberNonTrioFamilyFile + + " -V " + getThreeMemberNonTrioTest + + " -skipPop", + 1, + Arrays.asList("abfa4332bce9aba911ad2eba34ee9924")); + executeTest("testFamilyPriors", spec); + } -} \ No newline at end of file + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java index 891ad8d38..c9fe0eda2 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java @@ -52,7 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import htsjdk.variant.variantcontext.VariantContext; import org.testng.Assert; import org.testng.annotations.Test; @@ -100,7 +100,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "tetraploid-gvcf-3.vcf" + " -L " + privateTestDir + "tetraploid-gvcfs.intervals", 1, - Arrays.asList("20f55be01d01bed48bf66f354fa72e5b")); + Arrays.asList("ebe26077809961f53d5244643d24fd45")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -112,7 +112,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "diploid-gvcf-3.vcf" + " -L " + privateTestDir + "tetraploid-gvcfs.intervals", 1, - Arrays.asList("c8bf3da5eb641d0082bdd5f12ea58e1e")); + Arrays.asList("2d36a5f996cad47e5d05fcd78f6e572e")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -190,7 +190,7 @@ public class CombineGVCFsIntegrationTest extends WalkerTest { @Test public void testMD5s() throws Exception { final String cmd = baseTestString(" -L 1:69485-69791"); - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("1df56fdfc71729cc82ba5dbfc75a72c4")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("83ea9f4a9aadb1218c21c9d3780e8009")); spec.disableShadowBCF(); executeTest("testMD5s", spec); } @@ -198,15 +198,44 @@ public class CombineGVCFsIntegrationTest extends WalkerTest { @Test public void testBasepairResolutionOutput() throws Exception { final String cmd = baseTestString(" -L 1:69485-69791 --convertToBasePairResolution"); - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("9c8fc4d9e330fbe41a00b7f71a784f4e")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("f153cb6e986efc9b50f0b8833fe5d3da")); spec.disableShadowBCF(); executeTest("testBasepairResolutionOutput", spec); } + @Test + public void testBreakBlocks() throws Exception { + final String cmd = baseTestString(" -L 1:69485-69791 --breakBandsAtMultiplesOf 5"); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("6626ff272e7e76fba091f5bde4a1f963")); + spec.disableShadowBCF(); + executeTest("testBreakBlocks", spec); + } + + @Test + public void testSpanningDeletions() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf", + 1, + Arrays.asList("fba48ce2bf8761366ff2cd0b45d0421f")); + spec.disableShadowBCF(); + executeTest("testSpanningDeletions", spec); + } + + @Test + public void testWrongReferenceBaseBugFix() throws Exception { + final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input1.vcf" + + " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input2.vcf") + " -o %s --no_cmdline_in_header"); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("331c1a4a6a72ea1617c1697a5d945d56")); + spec.disableShadowBCF(); + executeTest("testWrongReferenceBaseBugFix",spec); + + } + @Test public void testBasepairResolutionInput() throws Exception { final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -V " + privateTestDir + "gvcf.basepairResolution.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("6116f3c70cd5288f3e8b89b1953a1e15")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("207e89b5677fbf0ef4d1ff768262cf0c")); spec.disableShadowBCF(); executeTest("testBasepairResolutionInput", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsUnitTest.java index a301adc02..d3a145be6 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsUnitTest.java @@ -52,7 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; import htsjdk.tribble.readers.PositionalBufferedStream; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import htsjdk.variant.vcf.VCFCodec; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ConcordanceMetricsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ConcordanceMetricsUnitTest.java index 38ac62287..c7043bad9 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ConcordanceMetricsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/ConcordanceMetricsUnitTest.java @@ -742,4 +742,45 @@ public class ConcordanceMetricsUnitTest extends BaseTest { Assert.assertEquals(metrics.getOverallSiteConcordance().get(ConcordanceMetrics.SiteConcordanceType.EVAL_SUPERSET_TRUTH),1); } + + private Pair getMonoallelicData() { + + final Allele ref = Allele.create(BaseUtils.Base.T.base,true); + final Allele alt = Allele.create(BaseUtils.Base.C.base); + + //Site in eval is monoallelic, both samples are HOM_REF + //sample1 in comp is HOM_VAR, sample2 is NO_CALL + //None of these should trigger mismatching alleles + final GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1",1,1); + final VariantContextBuilder site1Comp = new VariantContextBuilder(); + final VariantContextBuilder site1Eval = new VariantContextBuilder(); + site1Comp.loc(loc.getContig(), loc.getStart(), loc.getStop()); + site1Eval.loc(loc.getContig(), loc.getStart(), loc.getStop()); + site1Comp.alleles(Arrays.asList(ref)); + site1Eval.alleles(Arrays.asList(ref, alt)); + site1Comp.genotypes(GenotypeBuilder.create("test2_sample1", Arrays.asList(ref, ref)), + GenotypeBuilder.create("test2_sample2", Arrays.asList(ref, ref))); + site1Eval.genotypes(GenotypeBuilder.create("test2_sample1",Arrays.asList(alt,alt)), + GenotypeBuilder.create("test2_sample2",Arrays.asList(Allele.NO_CALL,Allele.NO_CALL))); + + return new Pair<>(site1Eval.make(), site1Comp.make()); + } + + @Test + public void testMonoallelicSite() { + final Pair data = getMonoallelicData(); + final VariantContext eval = data.getFirst(); + final VariantContext truth = data.getSecond(); + final VCFCodec codec = new VCFCodec(); + final VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); + final VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); + final ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader,null); + metrics.update(eval,truth); + + + Assert.assertEquals(metrics.getGenotypeConcordance("test2_sample1").getnMismatchingAlt(),0); + Assert.assertEquals(metrics.getGenotypeConcordance("test2_sample2").getnMismatchingAlt(),0); + Assert.assertEquals(metrics.getGenotypeConcordance("test2_sample1").getTable()[3][1],1); + Assert.assertEquals(metrics.getGenotypeConcordance("test2_sample2").getTable()[0][1],1); + } } \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index d4ebc3bc7..d1ac6eca0 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -51,10 +51,21 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; +import htsjdk.tribble.readers.LineIterator; +import htsjdk.tribble.readers.PositionalBufferedStream; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFCodec; import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.testng.Assert; import org.testng.annotations.Test; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; import java.util.Arrays; +import java.util.ArrayList; +import java.util.List; public class GenotypeGVCFsIntegrationTest extends WalkerTest { @@ -71,10 +82,19 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf", b37KGReference), 1, - Arrays.asList("27bc40f7cc46bdc347284d7522b2aa6c")); + Arrays.asList("4dfea9a9b1a77c4c6b9edc61f9ea8da2")); executeTest("testUpdatePGT", spec); } + @Test(enabled = true) + public void testUpdatePGTStrandAlleleCountsBySample() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf -A StrandAlleleCountsBySample", b37KGReference), + 1, + Arrays.asList("a96b79e7c3689c8d5506083cb6d27390")); + executeTest("testUpdatePGT, adding StrandAlleleCountsBySample annotation", spec); + } + @Test(enabled = true) public void combineSingleSamplePipelineGVCF() { WalkerTestSpec spec = new WalkerTestSpec( @@ -83,7 +103,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-20,000,000", b37KGReference), 1, - Arrays.asList("bb7775a555ee9859e18a28cbc044a160")); + Arrays.asList("bf3c1982ab6ffee410cb6a1fff6e7105")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -95,7 +115,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "tetraploid-gvcf-3.vcf" + " -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference), 1, - Arrays.asList("a2e482cddbc987b0ba004e13044f6e81")); + Arrays.asList("47d454936dc1f17cf4c4f84f02841346")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -107,7 +127,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "diploid-gvcf-3.vcf" + " -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference), 1, - Arrays.asList("0ad7d784a15ad7f8b386ec7ca34032af")); + Arrays.asList("5d79ea9de8ada8520d01284cf0c9f720")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -119,7 +139,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference), 1, - Arrays.asList("fdd06679c8a14ef2010d075cbae76519")); + Arrays.asList("d69b43cac448f45218e77308fc01e9e6")); executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec); } @@ -132,7 +152,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-20,000,000", b37KGReference), 1, - Arrays.asList("9b7f2ba1bde2e0a0eb3ebc0afb6bc513")); + Arrays.asList("7c93d82758bfb6e7efec257ef8a46217")); executeTest("combineSingleSamplePipelineGVCFHierarchical", spec); } @@ -144,7 +164,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference), 1, - Arrays.asList("8201cee7120dfdb3fdeace0ec511c7b1")); + Arrays.asList("5b60a7a9575ea83407aa61123960a0cc")); executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec); } @@ -154,7 +174,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference + " -V " + privateTestDir + "gvcfExample1.vcf", 1, - Arrays.asList("b4bb1d21c7a3d793a98b0857c7c5d52b")); + Arrays.asList("9e59b94c84dd673b8db9d35cae7e0f68")); executeTest("testJustOneSample", spec); } @@ -165,23 +185,24 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V " + privateTestDir + "gvcfExample1.vcf" + " -V " + privateTestDir + "gvcfExample2.vcf", 1, - Arrays.asList("ec63a629cc707554d3dd2ba7254b3b8d")); + Arrays.asList("8407cb9a1ab34e705e5a54a0d4146d84")); executeTest("testSamplesWithDifferentLs", spec); } @Test(enabled = true) public void testNoPLsException() { // Test with input files with (1) 0/0 and (2) ./. + final String md5 = "3e69805dc1c0ada0a050a65b89ecab30"; WalkerTestSpec spec1 = new WalkerTestSpec( "-T GenotypeGVCFs --no_cmdline_in_header -L 1:1115550-1115551 -o %s -R " + hg19Reference + " --variant " + privateTestDir + "combined_genotype_gvcf_exception.vcf", 1, - Arrays.asList("9626a7108d616d63a2a8069b306c1fe0")); + Arrays.asList(md5)); WalkerTestSpec spec2 = new WalkerTestSpec( "-T GenotypeGVCFs --no_cmdline_in_header -L 1:1115550-1115551 -o %s -R " + hg19Reference + " --variant " + privateTestDir + "combined_genotype_gvcf_exception.nocall.vcf", 1, - Arrays.asList("9626a7108d616d63a2a8069b306c1fe0")); + Arrays.asList(md5)); executeTest("testNoPLsException.1", spec1); executeTest("testNoPLsException.2", spec2); } @@ -191,7 +212,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-nda"), 1, - Arrays.asList("d50e5035488f63c574dcb8485ff61fcb")); + Arrays.asList("5a036de16b7a87626d2b76727376d9df")); executeTest("testNDA", spec); } @@ -200,7 +221,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-maxAltAlleles 1"), 1, - Arrays.asList("8fa78191298b4d8c9b40fba2c705ad56")); + Arrays.asList("2f3e6879fa27128a8be7b067ded78966")); executeTest("testMaxAltAlleles", spec); } @@ -209,7 +230,265 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-stand_call_conf 300 -stand_emit_conf 100"), 1, - Arrays.asList("bd58c026e9c8df4d4166f22cd0f0ce65")); + Arrays.asList("2e4a1ad71e8fc127b594077166c0344b")); executeTest("testStandardConf", spec); } + + @Test + public void testStrandAlleleCountsBySample() throws IOException { + //HaplotypeCaller creates gVCF + final String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + final WalkerTestSpec specHaplotypeCaller = new WalkerTestSpec( + "-T HaplotypeCaller --disableDithering " + + String.format("-R %s -I %s ", b37KGReference, CEUTRIO_BAM) + + "--no_cmdline_in_header -o %s -L 20:10130000-10134800 " + + "-ERC GVCF --sample_name NA12878 -variant_index_type LINEAR " + + "-variant_index_parameter 128000 -A StrandAlleleCountsBySample", + 1, Arrays.asList("") + ); + specHaplotypeCaller.disableShadowBCF(); //TODO: Remove when BaseTest.assertAttributesEquals() works with SC + final File gVCF = executeTest("testStrandAlleleCountsBySampleHaplotypeCaller", specHaplotypeCaller).getFirst().get(0); + List gVCFList = getAttributeValues(gVCF, new String("SAC")); + + //Use gVCF from HaplotypeCaller + final WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V " + gVCF.getAbsolutePath(), b37KGReference), + 1, + Arrays.asList("")); + final File outputVCF = executeTest("testStrandAlleleCountsBySample", spec).getFirst().get(0); + List outputVCFList = getAttributeValues(outputVCF, new String("SAC")); + + // All of the SAC values in the VCF were derived from the gVCF + Assert.assertTrue(gVCFList.containsAll(outputVCFList)); + } + + @Test + public void testUniquifiedSamples() throws IOException { + //two copies of 5 samples; will also test InbreedingCoeff calculation for uniquified samples + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + + " -V:sample1B " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + + " -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + + " -V:sample2B " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + + " -V:combined1 " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" + + " -V:combined2 " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" + + " --uniquifySamples", b37KGReference), + 1, + Arrays.asList("9a472c4e101fff4892efb9255c5cd8b3")); + executeTest("testUniquifiedSamples", spec); + + } + + /** + * Returns a list of attribute values from a VCF file + * + * @param vcfFile VCF file + * @param attributeName attribute name + * + * @throws IOException if the file does not exist or can not be opened + * + * @return list of attribute values + */ + private List getAttributeValues(final File vcfFile, final String attributeName) throws IOException { + final VCFCodec codec = new VCFCodec(); + final FileInputStream s = new FileInputStream(vcfFile); + final LineIterator lineIteratorVCF = codec.makeSourceFromStream(new PositionalBufferedStream(s)); + codec.readHeader(lineIteratorVCF); + + List attributeValues = new ArrayList(); + while (lineIteratorVCF.hasNext()) { + final String line = lineIteratorVCF.next(); + Assert.assertFalse(line == null); + final VariantContext vc = codec.decode(line); + + for (final Genotype g : vc.getGenotypes()) { + if (g.hasExtendedAttribute(attributeName)) { + attributeValues.add((String) g.getExtendedAttribute(attributeName)); + } + } + } + + return attributeValues; + } + + /** + * Section to test spanning deletions + */ + @Test + public void testSpanningDeletions() throws IOException { + final String gvcf1 = privateTestDir + "spanningDel.1.g.vcf"; + final String gvcf2 = privateTestDir + "spanningDel.2.g.vcf"; + final String gvcf3 = privateTestDir + "spanningDel.3.g.vcf"; + + // create the genotyped VCF to use as a basis for comparison against all of the combined versions + // case 0: GenotypeGVCFs(1.g.vcf, 2.g.vcf, 3.g.vcf) + final WalkerTestSpec genotypeBase = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + gvcf1 + " -V " + gvcf2 + " -V " + gvcf3, + 1, + Arrays.asList("")); + genotypeBase.disableShadowBCF(); + final File genotypeBaseVCF = executeTest("genotypeBase", genotypeBase).getFirst().get(0); + final List BASE_VARIANT_CONTEXTS = getVariantContexts(genotypeBaseVCF); + + // case 1: GenotypeGVCFs(CombineGVCFs(1.g.vcf, 2.g.vcf), 3.g.vcf) + final WalkerTestSpec combine12 = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + gvcf1 + " -V " + gvcf2, + 1, + Arrays.asList("")); + combine12.disableShadowBCF(); + final File combined_gVCF12 = executeTest("combine12", combine12).getFirst().get(0); + final WalkerTestSpec genotype12_3 = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + combined_gVCF12.getAbsolutePath() + " -V " + gvcf3, + 1, + Arrays.asList("")); + genotype12_3.disableShadowBCF(); + final File genotype12_3VCF = executeTest("genotype12_3", genotype12_3).getFirst().get(0); + final List VARIANT_CONTEXTS_12_3 = getVariantContexts(genotype12_3VCF); + testVCsAreEqual(BASE_VARIANT_CONTEXTS, VARIANT_CONTEXTS_12_3); + + // case 2: GenotypeGVCFs(CombineGVCFs(CombineGVCFs(1.g.vcf, 2.g.vcf), 3.g.vcf)) + final WalkerTestSpec combine12then3 = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + combined_gVCF12 + " -V " + gvcf3, + 1, + Arrays.asList("")); + combine12then3.disableShadowBCF(); + final File combined_gVCF12then3 = executeTest("combined_gVCF12then3", combine12then3).getFirst().get(0); + final WalkerTestSpec genotype12then3 = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + combined_gVCF12then3.getAbsolutePath(), + 1, + Arrays.asList("")); + genotype12then3.disableShadowBCF(); + final File genotype12then3VCF = executeTest("genotype12then3", genotype12then3).getFirst().get(0); + final List VARIANT_CONTEXTS_12then3 = getVariantContexts(genotype12then3VCF); + testVCsAreEqual(BASE_VARIANT_CONTEXTS, VARIANT_CONTEXTS_12then3); + + // case 3: GenotypeGVCFs(CombineGVCFs(CombineGVCFs(1.g.vcf, 3.g.vcf), 2.g.vcf)) + final WalkerTestSpec combine13 = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + gvcf1 + " -V " + gvcf3, + 1, + Arrays.asList("")); + combine13.disableShadowBCF(); + final File combined_gVCF13 = executeTest("combine13", combine13).getFirst().get(0); + final WalkerTestSpec combine13then2 = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + combined_gVCF13 + " -V " + gvcf2, + 1, + Arrays.asList("")); + combine13then2.disableShadowBCF(); + final File combined_gVCF13then2 = executeTest("combined_gVCF13then2", combine13then2).getFirst().get(0); + final WalkerTestSpec genotype13then2 = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + combined_gVCF13then2.getAbsolutePath(), + 1, + Arrays.asList("")); + genotype13then2.disableShadowBCF(); + final File genotype13then2VCF = executeTest("genotype13then2", genotype13then2).getFirst().get(0); + final List VARIANT_CONTEXTS_13then2 = getVariantContexts(genotype13then2VCF); + testVCsAreEqual(BASE_VARIANT_CONTEXTS, VARIANT_CONTEXTS_13then2); + + // case 4: GenotypeGVCFs(CombineGVCFs(1.g.vcf, 2.g.vcf, 3.g.vcf)) + final WalkerTestSpec combine123 = new WalkerTestSpec( + "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + gvcf1 + " -V " + gvcf2 + " -V " + gvcf3, + 1, + Arrays.asList("")); + combine123.disableShadowBCF(); + final File combined_gVCF123 = executeTest("combine123", combine123).getFirst().get(0); + final WalkerTestSpec genotype123 = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + combined_gVCF123.getAbsolutePath(), + 1, + Arrays.asList("")); + genotype123.disableShadowBCF(); + final File genotype123VCF = executeTest("genotype123", genotype123).getFirst().get(0); + final List VARIANT_CONTEXTS_123 = getVariantContexts(genotype123VCF); + testVCsAreEqual(BASE_VARIANT_CONTEXTS, VARIANT_CONTEXTS_123); + } + + /** + * Returns a list of VariantContext records from a VCF file + * + * @param vcfFile VCF file + * + * @throws IOException if the file does not exist or can not be opened + * + * @return list of VariantContext records + */ + private static List getVariantContexts(final File vcfFile) throws IOException { + final VCFCodec codec = new VCFCodec(); + final FileInputStream s = new FileInputStream(vcfFile); + final LineIterator lineIteratorVCF = codec.makeSourceFromStream(new PositionalBufferedStream(s)); + codec.readHeader(lineIteratorVCF); + + final List VCs = new ArrayList<>(); + while ( lineIteratorVCF.hasNext() ) { + final String line = lineIteratorVCF.next(); + Assert.assertFalse(line == null); + VCs.add(codec.decode(line)); + } + + return VCs; + } + + private static void testVCsAreEqual(final List VCs1, final List VCs2) { + Assert.assertEquals(VCs1.size(), VCs2.size(), "number of Variant Contexts"); + for ( int i = 0; i < VCs1.size(); i++ ) { + final VariantContext vc1 = VCs1.get(i); + final VariantContext vc2 = VCs2.get(i); + Assert.assertEquals(vc1.toStringDecodeGenotypes(), vc2.toStringDecodeGenotypes()); + } + } + + + private static final String simpleSpanningDeletionsMD5 = "e8616a396d40b4918ad30189856ceb01"; + + @Test(enabled = true) + public void testSpanningDeletionsMD5() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf", + 1, + Arrays.asList(simpleSpanningDeletionsMD5)); + spec.disableShadowBCF(); + executeTest("testSpanningDeletionsMD5", spec); + } + + @Test(enabled = true) + public void testSpanningDeletionsFromCombinedGVCF() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.combined.g.vcf", + 1, + Arrays.asList(simpleSpanningDeletionsMD5)); + spec.disableShadowBCF(); + executeTest("testSpanningDeletionsFromCombinedGVCFMD5", spec); + } + + @Test(enabled = true) + public void testMultipleSpanningDeletionsMD5() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf -V " + privateTestDir + "spanningDel.3.g.vcf", + 1, + Arrays.asList("1c418229117bc8f148a69eda9c496309")); + spec.disableShadowBCF(); + executeTest("testMultipleSpanningDeletionsMD5", spec); + } + + @Test(enabled = true) + public void testSpanningDeletionDoesNotGetGenotypedWithNoOtherAlleles() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + + " -V " + privateTestDir + "spanningDel.delOnly.g.vcf", + 1, + Arrays.asList("46169d08f93e5ff57856c7b64717314b")); + spec.disableShadowBCF(); + executeTest("testSpanningDeletionDoesNotGetGenotypedWithNoOtherAlleles", spec); + } } \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java index f72bfc415..b9cfc0949 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java @@ -51,29 +51,22 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMReadGroupRecord; import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import org.testng.Assert; -import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.io.File; -import java.io.FileNotFoundException; import java.util.*; /** diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/PosteriorLikelihoodsUtilsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/PosteriorLikelihoodsUtilsUnitTest.java index 9cfd9ae1d..e24623998 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/PosteriorLikelihoodsUtilsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/PosteriorLikelihoodsUtilsUnitTest.java @@ -62,6 +62,7 @@ import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.MathUtils; import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.VCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.testng.Assert; import org.testng.annotations.BeforeSuite; import org.testng.annotations.Test; @@ -74,7 +75,6 @@ import java.util.List; public class PosteriorLikelihoodsUtilsUnitTest extends BaseTest { Allele Aref, T, C, G, Cref, ATC, ATCATC; - private final String PHRED_SCALED_POSTERIORS_KEY = "PP"; @BeforeSuite public void setup() { @@ -151,16 +151,16 @@ public class PosteriorLikelihoodsUtilsUnitTest extends BaseTest { makeG("s10",Aref,T,20,0,10), makeG("s11",T,T,60,40,0), makeG("s12",Aref,Aref,0,30,90)); - test1 = new VariantContextBuilder(test1).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,12).make(); + test1 = new VariantContextBuilder(test1).attribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY,12).make(); VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(test1, new ArrayList(), 0, 0.001, true, false, false); Genotype test1exp1 = makeGwithPLs("s1",Aref,T,new double[]{-2.26110257, -0.02700903, -1.26110257}); Assert.assertTrue(test1exp1.hasPL()); Genotype test1exp2 = makeGwithPLs("s2",T,T,new double[]{-6.000075e+00, -3.765981e+00, -7.488009e-05}); Genotype test1exp3 = makeGwithPLs("s3",Aref,Aref,new double[]{-0.0007438855, -2.7666503408, -9.0007438855}); - Assert.assertEquals("java.util.ArrayList",test1result.getGenotype(0).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY).getClass().getCanonicalName()); - Assert.assertEquals(arraysEq(test1exp1.getPL(), _mleparse((List)test1result.getGenotype(0).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY))), ""); - Assert.assertEquals(arraysEq(test1exp2.getPL(),_mleparse((List)test1result.getGenotype(1).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY))), ""); - Assert.assertEquals(arraysEq(test1exp3.getPL(),_mleparse((List)test1result.getGenotype(2).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY))), ""); + Assert.assertEquals("java.util.ArrayList",test1result.getGenotype(0).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY).getClass().getCanonicalName()); + Assert.assertEquals(arraysEq(test1exp1.getPL(), _mleparse((List)test1result.getGenotype(0).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test1exp2.getPL(),_mleparse((List)test1result.getGenotype(1).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test1exp3.getPL(),_mleparse((List)test1result.getGenotype(2).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY))), ""); // AA AB BB AC BC CC // AA AC CC AT CT TT @@ -177,16 +177,16 @@ public class PosteriorLikelihoodsUtilsUnitTest extends BaseTest { makeG("s10",Aref,C,40,0,10,30,40,80), makeG("s11",Aref,Aref,0,5,8,15,20,40), makeG("s12",C,T,80,40,12,20,0,10)); - test2 = new VariantContextBuilder(test2).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,new ArrayList(Arrays.asList(6,6))).make(); + test2 = new VariantContextBuilder(test2).attribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY,new ArrayList(Arrays.asList(6,6))).make(); VariantContext test2result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(test2,new ArrayList(),5,0.001,true,false, false); Genotype test2exp1 = makeGwithPLs("s1",Aref,T,new double[]{-2.823957, -1.000000, -6.686344, 0.000000, -1.952251, -9.686344}); Genotype test2exp2 = makeGwithPLs("s2",Aref,C,new double[]{-3.823957, 0.000000, -1.686344, -3.000000, -4.452251, -8.686344}); Genotype test2exp3 = makeGwithPLs("s3",Aref,Aref,new double[] {0.000000, -0.676043, -1.662387, -1.676043, -2.628294, -4.862387}); Genotype test2exp4 = makeGwithPLs("s4",C,T,new double[]{-7.371706, -3.547749, -1.434094, -1.547749, 0.000000, -1.234094}); - Assert.assertEquals(arraysEq(test2exp1.getPL(),(int[]) _mleparse((List)test2result.getGenotype(0).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY))), ""); - Assert.assertEquals(arraysEq(test2exp2.getPL(),(int[]) _mleparse((List)test2result.getGenotype(1).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY))), ""); - Assert.assertEquals(arraysEq(test2exp3.getPL(),(int[]) _mleparse((List)test2result.getGenotype(2).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY))), ""); - Assert.assertEquals(arraysEq(test2exp4.getPL(),(int[]) _mleparse((List)test2result.getGenotype(3).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test2exp1.getPL(),(int[]) _mleparse((List)test2result.getGenotype(0).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test2exp2.getPL(),(int[]) _mleparse((List)test2result.getGenotype(1).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test2exp3.getPL(),(int[]) _mleparse((List)test2result.getGenotype(2).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test2exp4.getPL(),(int[]) _mleparse((List)test2result.getGenotype(3).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY))), ""); } @Test @@ -195,7 +195,7 @@ public class PosteriorLikelihoodsUtilsUnitTest extends BaseTest { makeG("s2",Aref,T,18,0,24), makeG("s3",Aref,T,22,0,12)); List supplTest1 = new ArrayList<>(3); - supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,2).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY,2).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); supplTest1.add(new VariantContextBuilder(makeVC("3",Arrays.asList(Aref,T))).attribute(VCFConstants.ALLELE_COUNT_KEY,4).attribute(VCFConstants.ALLELE_NUMBER_KEY,22).make()); supplTest1.add(makeVC("4",Arrays.asList(Aref,T), makeG("s_1",T,T), @@ -205,25 +205,25 @@ public class PosteriorLikelihoodsUtilsUnitTest extends BaseTest { Genotype test1exp1 = makeGwithPLs("t1",T,T,new double[]{-3.370985, -1.415172, -0.01721766}); Genotype test1exp2 = makeGwithPLs("t2",Aref,T,new double[]{-1.763792, -0.007978791, -3.010024}); Genotype test1exp3 = makeGwithPLs("t3",Aref,T,new double[]{-2.165587, -0.009773643, -1.811819}); - Assert.assertEquals(arraysEq(test1exp1.getPL(),_mleparse((List) test1result.getGenotype(0).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY))), ""); - Assert.assertEquals(arraysEq(test1exp2.getPL(),_mleparse((List) test1result.getGenotype(1).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY))), ""); - Assert.assertEquals(arraysEq(test1exp3.getPL(),_mleparse((List) test1result.getGenotype(2).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test1exp1.getPL(),_mleparse((List) test1result.getGenotype(0).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test1exp2.getPL(),_mleparse((List) test1result.getGenotype(1).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test1exp3.getPL(),_mleparse((List) test1result.getGenotype(2).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY))), ""); VariantContext testNonOverlapping = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,3,1,0)); List other = Arrays.asList(makeVC("2",Arrays.asList(Aref,C),makeG("s2",C,C,10,2,0))); VariantContext test2result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testNonOverlapping,other,0,0.001,true,false,false); Genotype test2exp1 = makeGwithPLs("SGV",T,T,new double[]{-4.078345, -3.276502, -0.0002661066}); - Assert.assertEquals(arraysEq(test2exp1.getPL(),_mleparse((List) test2result.getGenotype(0).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test2exp1.getPL(),_mleparse((List) test2result.getGenotype(0).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY))), ""); } @Test private void testCalculatePosteriorHOM_VARtoHET() { VariantContext testOverlappingBase = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,40,1,0)); List supplTest1 = new ArrayList<>(1); - supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,500).attribute(VCFConstants.ALLELE_NUMBER_KEY,1000).make()); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY,500).attribute(VCFConstants.ALLELE_NUMBER_KEY,1000).make()); VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); - int[] GP = _mleparse( (List)test1result.getGenotype(0).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY)); + int[] GP = _mleparse( (List)test1result.getGenotype(0).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY)); Assert.assertTrue(GP[2] > GP[1]); } @@ -231,10 +231,10 @@ public class PosteriorLikelihoodsUtilsUnitTest extends BaseTest { private void testCalculatePosteriorHETtoHOM_VAR() { VariantContext testOverlappingBase = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,40,0,1)); List supplTest1 = new ArrayList<>(1); - supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,900).attribute(VCFConstants.ALLELE_NUMBER_KEY,1000).make()); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY,900).attribute(VCFConstants.ALLELE_NUMBER_KEY,1000).make()); VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); - int[] GP = _mleparse( (List)test1result.getGenotype(0).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY)); + int[] GP = _mleparse( (List)test1result.getGenotype(0).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY)); Assert.assertTrue(GP[2] < GP[1]); } @@ -242,10 +242,10 @@ public class PosteriorLikelihoodsUtilsUnitTest extends BaseTest { private void testCalculatePosteriorHOM_REFtoHET() { VariantContext testOverlappingBase = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,0,1,40)); List supplTest1 = new ArrayList<>(1); - supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,500).attribute(VCFConstants.ALLELE_NUMBER_KEY,1000).make()); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY,500).attribute(VCFConstants.ALLELE_NUMBER_KEY,1000).make()); VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); - int[] GP = _mleparse( (List)test1result.getGenotype(0).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY)); + int[] GP = _mleparse( (List)test1result.getGenotype(0).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY)); Assert.assertTrue(GP[0] > GP[1]); } @@ -253,10 +253,10 @@ public class PosteriorLikelihoodsUtilsUnitTest extends BaseTest { private void testCalculatePosteriorHETtoHOM_REF() { VariantContext testOverlappingBase = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,1,0,40)); List supplTest1 = new ArrayList<>(1); - supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,100).attribute(VCFConstants.ALLELE_NUMBER_KEY,1000).make()); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY,100).attribute(VCFConstants.ALLELE_NUMBER_KEY,1000).make()); VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); - int[] GP = _mleparse( (List)test1result.getGenotype(0).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY)); + int[] GP = _mleparse( (List)test1result.getGenotype(0).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY)); Assert.assertTrue(GP[0] < GP[1]); } @@ -266,7 +266,7 @@ public class PosteriorLikelihoodsUtilsUnitTest extends BaseTest { makeG("s2",Aref,T,18,0,24), makeG("s3",Aref,T,22,0,12)); List supplTest1 = new ArrayList<>(1); - supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,11).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY,11).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); } @@ -287,7 +287,7 @@ public class PosteriorLikelihoodsUtilsUnitTest extends BaseTest { makeG("s2",Aref,T,18,0,24), makeG("s3",Aref,T,22,0,12)); List supplTest1 = new ArrayList<>(1); - supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T,C))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,5).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T,C))).attribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY,5).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); } @@ -307,7 +307,7 @@ public class PosteriorLikelihoodsUtilsUnitTest extends BaseTest { makeG("s2",Aref,T,18,0,24), makeG("s3",Aref,T,22,0,12)); List supplTest1 = new ArrayList<>(1); - supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T,C))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,Arrays.asList(5,4)).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T,C))).attribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY,Arrays.asList(5,4)).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); } @@ -317,11 +317,11 @@ public class PosteriorLikelihoodsUtilsUnitTest extends BaseTest { makeG("s2",Aref,ATC,18,0,24), makeG("s3",Aref,ATC,22,0,12)); List supplTest1 = new ArrayList<>(1); - supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T,C))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,Arrays.asList(5,4)).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T,C))).attribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY,Arrays.asList(5,4)).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(inputIndel,supplTest1,0,0.001,true,false,false); System.out.println(test1result); - int[] GPs = _mleparse( (List)test1result.getGenotype(0).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY)); + int[] GPs = _mleparse( (List)test1result.getGenotype(0).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY)); int[] PLs = test1result.getGenotype(0).getPL(); Assert.assertEquals(PLs,GPs); } @@ -332,12 +332,12 @@ public class PosteriorLikelihoodsUtilsUnitTest extends BaseTest { makeG("s2",Aref,T,18,0,24), makeG("s3",Aref,T,22,0,12)); List supplTest1 = new ArrayList<>(1); - supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,ATC,ATCATC))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,Arrays.asList(5,4)).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,ATC,ATCATC))).attribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY,Arrays.asList(5,4)).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(inputIndel,supplTest1,0,0.001,true,false,false); System.out.println(test1result); - int[] GPs = _mleparse( (List)test1result.getGenotype(0).getAnyAttribute(PHRED_SCALED_POSTERIORS_KEY)); + int[] GPs = _mleparse( (List)test1result.getGenotype(0).getAnyAttribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY)); int[] PLs = test1result.getGenotype(0).getPL(); Assert.assertEquals(PLs,GPs); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java index ce7834a49..e0ce9405e 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -53,8 +53,10 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; import org.broadinstitute.gatk.engine.walkers.WalkerTest; import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; import java.util.Arrays; public class SelectVariantsIntegrationTest extends WalkerTest { @@ -62,6 +64,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest { return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s --no_cmdline_in_header" + args; } + private static final String sampleExclusionMD5 = "eea22fbf1e490e59389a663c3d6a6537"; + private static final String invertSelectionMD5 = "831bc0a5a723b0681a910d668ff3757b"; + @Test public void testDiscordanceNoSampleSpecified() { String testFile = privateTestDir + "NA12878.hg19.example1.vcf"; @@ -148,6 +153,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testNonExistingSelection--" + testfile, spec); } + /** + * Test excluding samples from file and sample name + */ @Test public void testSampleExclusionFromFileAndSeparateSample() { String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; @@ -160,9 +168,12 @@ public class SelectVariantsIntegrationTest extends WalkerTest { ); spec.disableShadowBCF(); - executeTest("testSampleExclusion--" + testfile, spec); + executeTest("testSampleExclusionFromFileAndSeparateSample--" + testfile, spec); } + /** + * Test excluding samples from file + */ @Test public void testSampleExclusionJustFromFile() { String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; @@ -175,9 +186,46 @@ public class SelectVariantsIntegrationTest extends WalkerTest { ); spec.disableShadowBCF(); - executeTest("testSampleExclusion--" + testfile, spec); + executeTest("testSampleExclusionJustFromFile--" + testfile, spec); } + /** + * Test excluding samples from expression + */ + @Test + public void testSampleExclusionJustFromExpression() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_se '[CDH]' --variant " + testfile, + 1, + Arrays.asList(sampleExclusionMD5) + ); + spec.disableShadowBCF(); + + executeTest("testSampleExclusionJustFromExpression--" + testfile, spec); + } + + /** + * Test excluding samples from negation expression + */ + @Test + public void testSampleExclusionJustFromNegationExpression() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -se '[^CDH]' --variant " + testfile, + 1, + Arrays.asList(sampleExclusionMD5) + ); + spec.disableShadowBCF(); + + executeTest("testSampleExclusionJustFromRegexExpression--" + testfile, spec); + } + + /** + * Test including samples that are not in the VCF + */ @Test public void testSampleInclusionWithNonexistingSamples() { String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; @@ -193,7 +241,6 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testSampleInclusionWithNonexistingSamples--" + testfile, spec); } - @Test public void testConcordance() { String testFile = privateTestDir + "NA12878.hg19.example1.vcf"; @@ -210,6 +257,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testConcordance--" + testFile, spec); } + /** + * Test including variant types. + */ @Test public void testVariantTypeSelection() { String testFile = privateTestDir + "complexExample1.vcf"; @@ -217,23 +267,42 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("ca2b70e3171420b08b0a2659bfe2a794") + Arrays.asList("2c50ab2ae96fae40bfc2b8398fc5e54e") ); executeTest("testVariantTypeSelection--" + testFile, spec); } + /** + * Test excluding indels that are larger than the specified size + */ @Test - public void testIndelLengthSelection() { + public void testMaxIndelLengthSelection() { String testFile = privateTestDir + "complexExample1.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --maxIndelSize 3", + "-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --maxIndelSize 2", 1, - Arrays.asList("004589868ca5dc887e2dff876b4cc797") + Arrays.asList("2c50ab2ae96fae40bfc2b8398fc5e54e") ); - executeTest("testIndelLengthSelection--" + testFile, spec); + executeTest("testMaxIndelLengthSelection--" + testFile, spec); + } + + /** + * Test excluding indels that are smaller than the specified size + */ + @Test + public void testMinIndelLengthSelection() { + String testFile = privateTestDir + "complexExample1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --minIndelSize 2", + 1, + Arrays.asList("fa5f3eb4f0fc5cedc93e6c519c0c8bcb") + ); + + executeTest("testMinIndelLengthSelection--" + testFile, spec); } @Test @@ -280,7 +349,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String testFile = privateTestDir + "vcfexample.loseAlleleInSelection.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants --keepOriginalAC -env -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", + "-T SelectVariants --keepOriginalAC -env -trimAlternates -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", 1, Arrays.asList("4695c99d96490ed4e5b1568c5b52dea6") ); @@ -288,6 +357,19 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testKeepOriginalACAndENV--" + testFile, spec); } + @Test + public void testKeepOriginalDP() { + String testFile = privateTestDir + "CEUtrioTest.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants --keepOriginalDP -R " + b37KGReference + " -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("e897097a47aee5516dc4f1c0b9d69037") + ); + + executeTest("testKeepOriginalDP--" + testFile, spec); + } + @Test public void testMultipleRecordsAtOnePosition() { String testFile = privateTestDir + "selectVariants.onePosition.vcf"; @@ -319,7 +401,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String testfile = privateTestDir + "multi-allelic.bi-allelicInGIH.vcf"; String samplesFile = privateTestDir + "GIH.samples.list"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile, + "-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants -trimAlternates --variant " + testfile, 1, Arrays.asList("69862fb97e8e895fe65c7abb14b03cee") ); @@ -378,4 +460,209 @@ public class SelectVariantsIntegrationTest extends WalkerTest { UserException.class); executeTest("InvalidJexl", spec); } + + @Test + public void testAlleleTrimming() { + final String testFile = privateTestDir + "forHardLeftAlignVariantsTest.vcf"; + final String cmd = "-T SelectVariants -R " + b37KGReference + " -sn NA12878 -env -trimAlternates " + + "-V " + testFile + " -o %s --no_cmdline_in_header"; + WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("9df942000eb18b12d9008c7d9b5c4178")); + executeTest("testAlleleTrimming", spec); + } + + @DataProvider(name="unusedAlleleTrimmingProvider") + public Object[][] unusedAlleleTrimmingProvider() { + return new Object[][] { + { privateTestDir+"forHardLeftAlignVariantsTest.vcf", "-trimAlternates", "9df942000eb18b12d9008c7d9b5c4178"}, + { privateTestDir+"forHardLeftAlignVariantsTest.vcf", "", "981b757e3dc6bf3864ac7e493cf9d30d"}, + { privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT", "8ded359dd87fd498ff38736ea0fa4c28"}, + { privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -env", "a7e7288dcd779cfac6983069de45b79c"}, + { privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -trimAlternates", "2e726d06a8d317199e8dda74691948a3"}, + { privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -env -trimAlternates", "1e5585f86c347da271a79fbfc61ac849"} + }; + } + + @Test(dataProvider = "unusedAlleleTrimmingProvider") + public void testUnusedAlleleTrimming(final String vcf, final String extraArgs, final String md5) { + final WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants" + + " -R "+b37KGReference + + " -V "+vcf + + " -o %s --no_cmdline_in_header" + + " "+extraArgs, + 1, + Arrays.asList(md5) + ); + executeTest(String.format("testUnusedAlleleTrimming: (%s,%s)", new File(vcf).getName(), extraArgs), spec); + } + + /** + * Test with an empty VCF file + */ + @Test + public void testEmptyVcfException(){ + String testfile = privateTestDir + "reallyEmpty.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants" + + " -R " + b36KGReference + + " -V " + testfile + + " -o %s --no_cmdline_in_header", + 1, + UserException.CommandLineException.class + ); + spec.disableShadowBCF(); + + executeTest("testEmptyVcfException--" + testfile, spec); + } + + /** + * Test with a VCF file that is not a file + */ + @Test + public void testNotFileVcfException(){ + String testfile = privateTestDir; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants" + + " -R " + b36KGReference + + " -V " + testfile + + " -o %s --no_cmdline_in_header", + 1, + UserException.CouldNotReadInputFile.class + ); + spec.disableShadowBCF(); + + executeTest("testNotFileVcfException--" + testfile, spec); + } + + /** + * Test with a VCF file that does not exist + */ + @Test + public void testMissingVcfException(){ + String testfile = "test.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants" + + " -R " + b36KGReference + + " -V " + testfile + + " -o %s --no_cmdline_in_header", + 1, + UserException.CouldNotReadInputFile.class + ); + spec.disableShadowBCF(); + + executeTest("testMissingVcfException--" + testfile, spec); + } + + /** + * Test inverting the variant selection criteria + */ + @Test + public void testInvertSelection() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 20000' --invert_selection --variant " + testfile), + 1, + Arrays.asList(invertSelectionMD5) + ); + spec.disableShadowBCF(); + executeTest("testInvertSelection--" + testfile, spec); + } + + /** + * Test inverting the variant JEXL selection criteria + */ + @Test + public void testInvertJexlSelection() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP >= 20000'--variant " + testfile), + 1, + Arrays.asList(invertSelectionMD5) + ); + spec.disableShadowBCF(); + executeTest("testInvertJexlSelection--" + testfile, spec); + } + + /** + * Test selecting variants with IDs + */ + @Test + public void testKeepSelectionID() { + String testFile = privateTestDir + "complexExample1.vcf"; + String idFile = privateTestDir + "complexExample1.vcf.id"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -IDs " + idFile + " --variant " + testFile), + 1, + Arrays.asList("2c50ab2ae96fae40bfc2b8398fc5e54e") + ); + spec.disableShadowBCF(); + executeTest("testKeepSelectionID--" + testFile, spec); + } + + /** + * Test excluding variants with IDs + */ + @Test + public void testExcludeSelectionID() { + String testFile = privateTestDir + "complexExample1.vcf"; + String idFile = privateTestDir + "complexExample1.vcf.id"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -xlIDs " + idFile + " --variant " + testFile), + 1, + Arrays.asList("77514a81233e1bbc0f5e47b0fb76a89a") + ); + spec.disableShadowBCF(); + executeTest("testExcludeSelectionID--" + testFile, spec); + } + + /** + * Test excluding variant types + */ + @Test + public void testExcludeSelectionType() { + String testFile = privateTestDir + "complexExample1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -xlSelectType SNP --variant " + testFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("fa5f3eb4f0fc5cedc93e6c519c0c8bcb") + ); + + executeTest("testExcludeSelectionType--" + testFile, spec); + } + + @Test + public void testMendelianViolationSelection() { + String testFile = privateTestDir + "CEUtrioTest.vcf"; + String pedFile = privateTestDir + "CEUtrio.ped"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R "+b37KGReference + " -mv -mvq 0 --variant " + testFile + " -ped " + pedFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("406243096074a417d2aa103bd3d13e01")); + + executeTest("testMendelianViolationSelection--" + testFile, spec); + } + + @Test + public void testInvertMendelianViolationSelection() { + String testFile = privateTestDir + "CEUtrioTest.vcf"; + String pedFile = privateTestDir + "CEUtrio.ped"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R "+b37KGReference + " -mv -mvq 0 -inv_mv --variant " + testFile + " -ped " + pedFile + " -o %s --no_cmdline_in_header", + 1, + Arrays.asList("35921fb2dedca0ead83027a66b725794")); + + executeTest("testMendelianViolationSelection--" + testFile, spec); + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsParallelIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsParallelIntegrationTest.java index 1ae01f179..8116b6b11 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsParallelIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariantsParallelIntegrationTest.java @@ -99,8 +99,8 @@ public class SelectVariantsParallelIntegrationTest extends WalkerTest { } { // AD and PL decoding race condition final String testfile = privateTestDir + "race_condition.vcf"; - final String args = "-env -sn SAMPLE -L 1:1-10,000,000 -V " + testfile; - new ParallelSelectTestProvider(b37KGReference, args, "62e6156387d6e91bd2b08ef649cb1129", nt); + final String args = "-env -trimAlternates -sn SAMPLE -L 1:1-10,000,000 -V " + testfile; + new ParallelSelectTestProvider(b37KGReference, args, "e86c6eb105ecdd3ff026999ffc692821", nt); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java new file mode 100644 index 000000000..6290fdd96 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java @@ -0,0 +1,308 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.variantutils; + +import htsjdk.variant.variantcontext.*; +import org.broadinstitute.gatk.utils.*; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Tests {@link org.broadinstitute.gatk.tools.walkers.variantutils.ReferenceConfidenceVariantContextMerger}. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class VariantContextMergerUnitTest extends BaseTest { + Allele Aref, T, C, G, Cref, ATC, ATCATC; + Allele ATCATCT; + Allele ATref; + Allele Anoref; + Allele GT; + Allele del; + + private GenomeLocParser genomeLocParser; + + @BeforeSuite + public void setup() throws IOException { + // alleles + Aref = Allele.create("A", true); + Cref = Allele.create("C", true); + T = Allele.create("T"); + C = Allele.create("C"); + G = Allele.create("G"); + ATC = Allele.create("ATC"); + ATCATC = Allele.create("ATCATC"); + ATCATCT = Allele.create("ATCATCT"); + ATref = Allele.create("AT",true); + Anoref = Allele.create("A",false); + del = GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE; + GT = Allele.create("GT",false); + genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(hg18Reference))); + } + + @Test(dataProvider = "referenceConfidenceMergeData") + public void testReferenceConfidenceMerge(final String testID, final List toMerge, final GenomeLoc loc, + final boolean returnSiteEvenIfMonomorphic, final boolean uniquifySamples, final VariantContext expectedResult) { + final VariantContext result = ReferenceConfidenceVariantContextMerger.merge(toMerge, loc, returnSiteEvenIfMonomorphic ? (byte) 'A' : null, true, uniquifySamples); + if ( result == null ) { + Assert.assertTrue(expectedResult == null); + return; + } + Assert.assertEquals(result.getAlleles(), expectedResult.getAlleles(),testID); + Assert.assertEquals(result.getNSamples(), expectedResult.getNSamples(),testID); + for ( final Genotype expectedGenotype : expectedResult.getGenotypes() ) { + Assert.assertTrue(result.hasGenotype(expectedGenotype.getSampleName()), "Missing " + expectedGenotype.getSampleName()); + // use string comparisons to test equality for now + Assert.assertEquals(result.getGenotype(expectedGenotype.getSampleName()).toString(), expectedGenotype.toString()); + } + } + + @Test + public void testGenerateADWithNewAlleles() { + + final int[] originalAD = new int[] {1,2,0}; + final int[] indexesOfRelevantAlleles = new int[] {0,1,2,2}; + + final int[] newAD = ReferenceConfidenceVariantContextMerger.generateAD(originalAD, indexesOfRelevantAlleles); + Assert.assertEquals(newAD, new int[]{1,2,0,0}); + } + + + @Test(expectedExceptions = UserException.class) + public void testGetIndexesOfRelevantAllelesWithNoALT() { + + final List alleles1 = new ArrayList<>(1); + alleles1.add(Allele.create("A", true)); + final List alleles2 = new ArrayList<>(1); + alleles2.add(Allele.create("A", true)); + GenotypeBuilder builder = new GenotypeBuilder(); + ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(alleles1, alleles2, -1, builder.make()); + Assert.fail("We should have thrown an exception because the allele was not present"); + } + + @Test(dataProvider = "getIndexesOfRelevantAllelesData") + public void testGetIndexesOfRelevantAlleles(final int allelesIndex, final List allAlleles) { + final List myAlleles = new ArrayList<>(3); + + // always add the reference and alleles + myAlleles.add(allAlleles.get(0)); + myAlleles.add(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + // optionally add another alternate allele + if ( allelesIndex > 0 ) + myAlleles.add(allAlleles.get(allelesIndex)); + + GenotypeBuilder builder = new GenotypeBuilder(); + + final int[] indexes = ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1, builder.make()); + + Assert.assertEquals(indexes.length, allAlleles.size()); + + for ( int i = 0; i < allAlleles.size(); i++ ) { + if ( i == 0 ) + Assert.assertEquals(indexes[i], 0); // ref should always match + else if ( i == allelesIndex ) + Assert.assertEquals(indexes[i], 2); // allele + else + Assert.assertEquals(indexes[i], 1); // + } + } + + + @DataProvider(name = "referenceConfidenceMergeData") + public Object[][] makeReferenceConfidenceMergeData() { + final List tests = new ArrayList<>(); + final int start = 10; + final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, start, start); + final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make(); + final VariantContext VCbase2 = new VariantContextBuilder("test2", "20", start, start, Arrays.asList(Aref)).make(); + final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start-1, start-1, Arrays.asList(Aref)).make(); + + final int[] standardPLs = new int[]{30, 20, 10, 71, 72, 73}; + final int[] reorderedSecondAllelePLs = new int[]{30, 71, 73, 20, 72, 10}; + + final List noCalls = new ArrayList<>(2); + noCalls.add(Allele.NO_CALL); + noCalls.add(Allele.NO_CALL); + + final List A_ALT = Arrays.asList(Aref, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[]{0, 100, 1000}).alleles(noCalls).make(); + final VariantContext vcA_ALT = new VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make(); + + final Allele AAref = Allele.create("AA", true); + final List AA_ALT = Arrays.asList(AAref, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[]{0, 80, 800}).alleles(noCalls).make(); + final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make(); + + final List A_C = Arrays.asList(Aref, C); + final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10}).alleles(noCalls).make(); + final List A_C_ALT = Arrays.asList(Aref, C, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcA_C = new VariantContextBuilder(VCbase2).alleles(A_C_ALT).genotypes(gA_C).make(); + final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make(); + + final List A_G_ALT = Arrays.asList(Aref, G, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make(); + + final List A_C_G = Arrays.asList(Aref, C, G); + final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make(); + final List A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74}).alleles(noCalls).make(); + final VariantContext vcA_C_G = new VariantContextBuilder(VCbase2).alleles(A_C_G_ALT).genotypes(gA_C_G).make(); + final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make(); + + final List A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcA_ATC_ALT = new VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make(); + + final Allele A = Allele.create("A", false); + final List AA_A_ALT = Arrays.asList(AAref, A, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); + final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make(); + final List A_C_del = Arrays.asList(Aref, C, del); + + // first test the case of a single record + tests.add(new Object[]{"test00",Arrays.asList(vcA_C_ALT), + loc, false, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).make()}); + + // now, test pairs: + // a SNP with another SNP + tests.add(new Object[]{"test01",Arrays.asList(vcA_C_ALT, vcA_G_ALT), + loc, false, false, + new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); + // a SNP with an indel + tests.add(new Object[]{"test02",Arrays.asList(vcA_C_ALT, vcA_ATC_ALT), + loc, false, false, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); + // a SNP with 2 SNPs + tests.add(new Object[]{"test03",Arrays.asList(vcA_C_ALT, vcA_C_G_ALT), + loc, false, false, + new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make()}); + // a SNP with a ref record + tests.add(new Object[]{"test04",Arrays.asList(vcA_C_ALT, vcA_ALT), + loc, false, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make()}); + + // spanning records: + // a SNP with a spanning ref record + tests.add(new Object[]{"test05",Arrays.asList(vcA_C_ALT, vcAA_ALT), + loc, false, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make()}); + // a SNP with a spanning deletion + tests.add(new Object[]{"test06",Arrays.asList(vcA_C_ALT, vcAA_A_ALT), + loc, false, false, + new VariantContextBuilder(VCbase).alleles(A_C_del).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73}).alleles(noCalls).make(), + new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 20, 72, 10}).alleles(noCalls).make()).make()}); + + // combination of all + tests.add(new Object[]{"test07",Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT), + loc, false, false, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC, del)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73, 71, 72, 73, 73, 73}).alleles(noCalls).make(), + new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73, 71, 73, 72, 73, 73}).alleles(noCalls).make(), + new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10, 71, 73, 73, 72, 73}).alleles(noCalls).make(), + new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74, 71, 72, 73, 74, 74}).alleles(noCalls).make(), + new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000, 100, 1000, 1000, 1000, 1000}).alleles(noCalls).make(), + new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800, 80, 800, 800, 800, 800}).alleles(noCalls).make(), + new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73, 20, 72, 72, 72, 10}).alleles(noCalls).make()).make()}); + + // just spanning ref contexts, trying both instances where we want/do not want ref-only contexts + tests.add(new Object[]{"test08",Arrays.asList(vcAA_ALT), + + loc, false, false, + null}); + tests.add(new Object[]{"test09", Arrays.asList(vcAA_ALT), + loc, true, false, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[]{0}).alleles(noCalls).make()).make()}); + + // test uniquification of sample names + tests.add(new Object[]{"test10",Arrays.asList(vcA_C, vcA_C_ALT), loc, false, true, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes( + new GenotypeBuilder("A_C.test2").PL(new int[]{30, 20, 10}).alleles(noCalls).make(), + new GenotypeBuilder("A_C.test").PL(new int[]{30, 20, 10}).alleles(noCalls).make()).make()}); + + tests.add(new Object[]{"test11",Arrays.asList(vcA_C_G, vcA_C_G_ALT), loc, false, true, + new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes( + new GenotypeBuilder("A_C_G.test2").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make(), + new GenotypeBuilder("A_C_G.test").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make()).make()}); + + final Object[][] result = tests.toArray(new Object[][]{}); + return result; + } + @DataProvider(name = "getIndexesOfRelevantAllelesData") + public Object[][] makeGetIndexesOfRelevantAllelesData() { + final int totalAlleles = 5; + final List alleles = new ArrayList<>(totalAlleles); + alleles.add(Allele.create("A", true)); + for ( int i = 1; i < totalAlleles; i++ ) + alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); + + final List tests = new ArrayList<>(); + + for ( int alleleIndex = 0; alleleIndex < totalAlleles; alleleIndex++ ) { + tests.add(new Object[]{alleleIndex, alleles}); + } + + return tests.toArray(new Object[][]{}); + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java index 216d43a21..387c26021 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java @@ -92,7 +92,9 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.fam",10), 3, - Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","daf853d2fcbfc77daa1f9ae190be24f4","02f1c462ebc8576e399d0e94f729fd95") + Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8", + "98f955f8b2af0aef58c96cc0cde66662", + "02f1c462ebc8576e399d0e94f729fd95") ); executeTest(testName, spec); @@ -182,7 +184,9 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("CEUTrio.subset.vcf", "CEUTrio.fam",10), 3, - Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","3bfb01c17935e3d194d266755b446e82","7887d2e0bf605dbcd0688c552cdb99d5") + Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2", + "cbef0432e78f0a31a3cb4cd1942ada62", + "7887d2e0bf605dbcd0688c552cdb99d5") ); executeTest(testName, spec); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToTableIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToTableIntegrationTest.java index ebda9e23e..fd7cf737a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToTableIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToTableIntegrationTest.java @@ -114,6 +114,21 @@ public class VariantsToTableIntegrationTest extends WalkerTest { executeTest("testGenotypeFields", spec); } + @Test + public void testMultiallelicGenotypeFields() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b37KGReference + + " --variant " + privateTestDir + "multiallelic_gt.vcf" + + " -T VariantsToTable" + + " -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F MULTI-ALLELIC" + + " -GF PL -GF AD" + + " -SMA" + + " -o %s", + 1, + Arrays.asList("7d38e7adb07eee94405188d145f22bb5")); + executeTest("testMultiallelicGenotypeFields", spec); + } + @Test(enabled = true) public void testGenotypeFieldsWithInline() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/collections/IndexedSetUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/collections/IndexedSetUnitTest.java index 56a07058f..2dafcb70d 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/collections/IndexedSetUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/collections/IndexedSetUnitTest.java @@ -51,7 +51,7 @@ package org.broadinstitute.gatk.utils.collections; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.Utils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -68,7 +68,7 @@ public class IndexedSetUnitTest { @Test(dataProvider = "initialCapacityElementCountMaxElementData") public void testCompositionBySingleElementAddition(final int initialCapacity, final int elementCount, final int maxElement) { - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); final IndexedSet subject = new IndexedSet<>(initialCapacity); final Set elementSet = new LinkedHashSet<>(); @@ -111,7 +111,7 @@ public class IndexedSetUnitTest { } private List generateElementCollection(final int elementCount, final int maxElement) { - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); final List elementList = new ArrayList<>(elementCount); for (int i = 0; i < elementCount; i++) @@ -163,7 +163,7 @@ public class IndexedSetUnitTest { final IndexedSet subject = new IndexedSet<>(elementList); final Set elementSet = new LinkedHashSet<>(elementList); final int removeCount = (subject.size() + 1) / 2; - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); for (int i = 0; i < removeCount; i++) { final int removeIndex = rnd.nextInt(subject.size()); final int removeElement = subject.get(removeIndex); @@ -181,7 +181,7 @@ public class IndexedSetUnitTest { final IndexedSet subject = new IndexedSet<>(elementList); final Set elementSet = new LinkedHashSet<>(elementList); final int removeCount = subject.size(); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); for (int i = 0; i < removeCount; i++) { final int removeIndex = rnd.nextInt(subject.size()); final int removeElement = subject.get(removeIndex); @@ -211,7 +211,7 @@ public class IndexedSetUnitTest { final IndexedSet subject = new IndexedSet<>(elementList); final Set elementSet = new LinkedHashSet<>(elementList); final int removeCount = subject.size(); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); for (int i = 0; i < removeCount; i++) { final int removeIndex = rnd.nextInt(subject.size()); final int removeElement = subject.get(removeIndex); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/collections/IntMaxHeapUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/collections/IntMaxHeapUnitTest.java index 03f19491f..aeab35ad4 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/collections/IntMaxHeapUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/collections/IntMaxHeapUnitTest.java @@ -51,7 +51,7 @@ package org.broadinstitute.gatk.utils.collections; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.Utils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -73,7 +73,7 @@ public class IntMaxHeapUnitTest { final IntMaxHeap heap = new IntMaxHeap(initialCapacity); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); for (int i = 0; i < elementCount; i++) { final int v = rnd.nextInt(); @@ -85,7 +85,7 @@ public class IntMaxHeapUnitTest { public void testEmptynessAndSize(final int initialCapacity, final int elementCount) { final IntMaxHeap heap = new IntMaxHeap(initialCapacity); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); Assert.assertEquals(heap.size(),0); Assert.assertTrue(heap.isEmpty()); @@ -101,7 +101,7 @@ public class IntMaxHeapUnitTest { public void testClear(final int initialCapacity, final int elementCount) { final IntMaxHeap heap = new IntMaxHeap(initialCapacity); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); for (int i = 0; i < elementCount; i++) { final int v = rnd.nextInt(); @@ -118,7 +118,7 @@ public class IntMaxHeapUnitTest { final IntMaxHeap addHeap = new IntMaxHeap(initialCapacity); final IntMaxHeap arrayAddHeap = new IntMaxHeap(initialCapacity); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); final int[] values = new int[elementCount]; for (int i = 0; i < elementCount; i++) { final int v = rnd.nextInt(); @@ -135,7 +135,7 @@ public class IntMaxHeapUnitTest { public void testRemove(final int initialCapacity, final int elementCount) { final IntMaxHeap heap = new IntMaxHeap(initialCapacity); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); final List values = new ArrayList<>(elementCount); for (int i = 0; i < elementCount; i++) { final int v = rnd.nextInt(); @@ -154,7 +154,7 @@ public class IntMaxHeapUnitTest { public void testPeek(final int initialCapacity, final int elementCount) { final IntMaxHeap heap = new IntMaxHeap(initialCapacity); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); int top = rnd.nextInt(); heap.add(top); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffEngineUnitTest.java new file mode 100644 index 000000000..e00ac7e8e --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffEngineUnitTest.java @@ -0,0 +1,259 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.utils.diffengine; + + +// the imports for unit testing. + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.diffengine.DiffElement; +import org.broadinstitute.gatk.utils.diffengine.DiffEngine; +import org.broadinstitute.gatk.utils.diffengine.DiffNode; +import org.broadinstitute.gatk.utils.diffengine.Difference; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Basic unit test for DifferableReaders in reduced reads + */ +public class DiffEngineUnitTest extends BaseTest { + DiffEngine engine; + + @BeforeClass(enabled = true) + public void createDiffEngine() { + engine = new DiffEngine(); + } + + // -------------------------------------------------------------------------------- + // + // Difference testing routines + // + // -------------------------------------------------------------------------------- + + private class DifferenceTest extends TestDataProvider { + public DiffElement tree1, tree2; + public List differences; + + private DifferenceTest(String tree1, String tree2) { + this(tree1, tree2, Collections.emptyList()); + } + + private DifferenceTest(String tree1, String tree2, String difference) { + this(tree1, tree2, Arrays.asList(difference)); + } + + private DifferenceTest(String tree1, String tree2, List differences) { + super(DifferenceTest.class); + this.tree1 = DiffNode.fromString(tree1); + this.tree2 = DiffNode.fromString(tree2); + this.differences = differences; + } + + public String toString() { + return String.format("tree1=%s tree2=%s diff=%s", + tree1.toOneLineString(), tree2.toOneLineString(), differences); + } + } + + @DataProvider(name = "trees") + public Object[][] createTrees() { + new DifferenceTest("A=X", "A=X"); + new DifferenceTest("A=X", "A=Y", "A:X!=Y"); + new DifferenceTest("A=X", "B=X", Arrays.asList("A:X!=MISSING", "B:MISSING!=X")); + new DifferenceTest("A=(X=1)", "B=(X=1)", Arrays.asList("A:(X=1)!=MISSING", "B:MISSING!=(X=1)")); + new DifferenceTest("A=(X=1)", "A=(X=1)"); + new DifferenceTest("A=(X=1 Y=2)", "A=(X=1 Y=2)"); + new DifferenceTest("A=(X=1 Y=2 B=(Z=3))", "A=(X=1 Y=2 B=(Z=3))"); + new DifferenceTest("A=(X=1)", "A=(X=2)", "A.X:1!=2"); + new DifferenceTest("A=(X=1 Y=2 B=(Z=3))", "A=(X=1 Y=2 B=(Z=4))", "A.B.Z:3!=4"); + new DifferenceTest("A=(X=1)", "A=(X=1 Y=2)", "A.Y:MISSING!=2"); + new DifferenceTest("A=(X=1 Y=2 B=(Z=3))", "A=(X=1 Y=2)", "A.B:(Z=3)!=MISSING"); + return DifferenceTest.getTests(DifferenceTest.class); + } + + @Test(enabled = true, dataProvider = "trees") + public void testDiffs(DifferenceTest test) { + logger.warn("Test tree1: " + test.tree1.toOneLineString()); + logger.warn("Test tree2: " + test.tree2.toOneLineString()); + + List diffs = engine.diff(test.tree1, test.tree2); + logger.warn("Test expected diff : " + test.differences); + logger.warn("Observed diffs : " + diffs); + } + + // -------------------------------------------------------------------------------- + // + // Low-level routines for summarizing differences + // + // -------------------------------------------------------------------------------- + + @Test(enabled = true) + public void testLongestCommonPostfix() { + testLongestCommonPostfixHelper("A", "A", 1); + testLongestCommonPostfixHelper("A", "B", 0); + testLongestCommonPostfixHelper("A.B", "A.B", 2); + testLongestCommonPostfixHelper("A.B.C", "A.B.C", 3); + testLongestCommonPostfixHelper("A.B.C", "X.B.C", 2); + testLongestCommonPostfixHelper("A.B.C", "X.Y.C", 1); + testLongestCommonPostfixHelper("A.B.C", "X.Y.Z", 0); + testLongestCommonPostfixHelper("A.B.C", "A.X.C", 1); + testLongestCommonPostfixHelper("A.B.C", "A.X.Z", 0); + testLongestCommonPostfixHelper("A.B.C", "A.B.Z", 0); + } + + public void testLongestCommonPostfixHelper(String p1, String p2, int expected) { + String[] parts1 = p1.split("\\."); + String[] parts2 = p2.split("\\."); + int obs = DiffEngine.longestCommonPostfix(parts1, parts2); + Assert.assertEquals(obs, expected, "p1=" + p1 + " p2=" + p2 + " failed"); + } + + @Test(enabled = true, dependsOnMethods = "testLongestCommonPostfix") + public void testSummarizePath() { + testSummarizePathHelper("A", "A", "A"); + testSummarizePathHelper("A", "B", "*"); + testSummarizePathHelper("A.B", "A.B", "A.B"); + testSummarizePathHelper("A.B", "X.B", "*.B"); + testSummarizePathHelper("A.B", "X.Y", "*.*"); + testSummarizePathHelper("A.B.C", "A.B.C", "A.B.C"); + testSummarizePathHelper("A.B.C", "X.B.C", "*.B.C"); + testSummarizePathHelper("A.B.C", "X.Y.C", "*.*.C"); + testSummarizePathHelper("A.B.C", "X.Y.Z", "*.*.*"); + testSummarizePathHelper("A.B.C", "A.X.C", "*.*.C"); + testSummarizePathHelper("A.B.C", "A.X.Z", "*.*.*"); + testSummarizePathHelper("A.B.C", "A.B.Z", "*.*.*"); + } + + public void testSummarizePathHelper(String p1, String p2, String expected) { + String[] parts1 = DiffEngine.diffNameToPath(p1); + String[] parts2 = DiffEngine.diffNameToPath(p2); + int obs = DiffEngine.longestCommonPostfix(parts1, parts2); + String path = DiffEngine.summarizedPath(parts2, obs); + Assert.assertEquals(path, expected, "p1=" + p1 + " p2=" + p2 + " failed"); + } + + // -------------------------------------------------------------------------------- + // + // High-level difference summary + // + // -------------------------------------------------------------------------------- + + private class SummarizeDifferenceTest extends TestDataProvider { + List diffs = new ArrayList(); + List expecteds = new ArrayList(); + + public SummarizeDifferenceTest() { super(SummarizeDifferenceTest.class); } + + public SummarizeDifferenceTest addDiff(String... diffsToAdd) { + diffs.addAll(Arrays.asList(diffsToAdd)); + return this; + } + + public SummarizeDifferenceTest addSummary(String... expectedSummary) { + expecteds.addAll(Arrays.asList(expectedSummary)); + return this; + } + + public String toString() { + return String.format("diffs=%s => expected=%s", diffs, expecteds); + } + + public void test() { + List diffPaths = new ArrayList(diffs.size()); + for ( String diff : diffs ) { diffPaths.add(DiffEngine.diffNameToPath(diff)); } + + List sumDiffs = engine.summarizedDifferencesOfPathsFromString(diffs); + + Assert.assertEquals(sumDiffs.size(), expecteds.size(), "Unexpected number of summarized differences: " + sumDiffs); + + for ( int i = 0; i < sumDiffs.size(); i++ ) { + Difference sumDiff = sumDiffs.get(i); + String expected = expecteds.get(i); + String[] pathCount = expected.split(":"); + String path = pathCount[0]; + int count = Integer.valueOf(pathCount[1]); + Assert.assertEquals(sumDiff.getPath(), path, "Unexpected path at: " + expected + " obs=" + sumDiff + " all=" + sumDiffs); + Assert.assertEquals(sumDiff.getCount(), count, "Unexpected counts at: " + expected + " obs=" + sumDiff + " all=" + sumDiffs); + } + } + } + + @DataProvider(name = "summaries") + public Object[][] createSummaries() { + new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2"); + new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1"); + new SummarizeDifferenceTest().addDiff("A", "A", "A").addSummary("A:3"); + new SummarizeDifferenceTest().addDiff("A", "A", "A", "B").addSummary("A:3", "B:1"); + new SummarizeDifferenceTest().addDiff("A", "A", "A", "B", "B").addSummary("A:3", "B:2"); + new SummarizeDifferenceTest().addDiff("A", "A", "A", "B", "B", "C").addSummary("A:3", "B:2", "C:1"); + new SummarizeDifferenceTest().addDiff("A.X", "A.X").addSummary("A.X:2"); + new SummarizeDifferenceTest().addDiff("A.X", "A.X", "B.X").addSummary("*.X:3", "A.X:2", "B.X:1"); + new SummarizeDifferenceTest().addDiff("A.X", "A.X", "B.X", "B.X").addSummary("*.X:4", "A.X:2", "B.X:2"); + new SummarizeDifferenceTest().addDiff("A.B.C", "X.B.C").addSummary("*.B.C:2", "A.B.C:1", "X.B.C:1"); + new SummarizeDifferenceTest().addDiff("A.B.C", "X.Y.C", "X.Y.C").addSummary("*.*.C:3", "X.Y.C:2", "A.B.C:1"); + new SummarizeDifferenceTest().addDiff("A.B.C", "A.X.C", "X.Y.C").addSummary("*.*.C:3", "A.B.C:1", "A.X.C:1", "X.Y.C:1"); + new SummarizeDifferenceTest().addDiff("A.B.C", "A.X.C", "B.X.C").addSummary("*.*.C:3", "*.X.C:2", "A.B.C:1", "A.X.C:1", "B.X.C:1"); + new SummarizeDifferenceTest().addDiff("A.B.C", "A.X.C", "B.X.C", "B.X.C").addSummary("*.*.C:4", "*.X.C:3", "B.X.C:2", "A.B.C:1", "A.X.C:1"); + + return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class); + } + + + @Test(enabled = true, dependsOnMethods = "testSummarizePath", dataProvider = "summaries") + public void testSummarizeDifferences(SummarizeDifferenceTest test) { + test.test(); + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffNodeUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffNodeUnitTest.java new file mode 100644 index 000000000..38252223a --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffNodeUnitTest.java @@ -0,0 +1,278 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.utils.diffengine; + + +// the imports for unit testing. + + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.diffengine.DiffElement; +import org.broadinstitute.gatk.utils.diffengine.DiffNode; +import org.broadinstitute.gatk.utils.diffengine.DiffValue; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Basic unit test for DifferableReaders in reduced reads + */ +public class DiffNodeUnitTest extends BaseTest { + // Data is: + // MY_ROOT + // fields: A=A, B=B + // nodes: C, D + // C: fields: E=E, nodes: none + // D: fields: F=F, G=G, nodes: none + static DiffNode MY_ROOT = DiffNode.rooted("MY_ROOT"); + static DiffValue Value_A = new DiffValue("A", MY_ROOT, "A"); + static DiffValue Value_B = new DiffValue("B", MY_ROOT, "B"); + static DiffNode NODE_C = DiffNode.empty("C", MY_ROOT); + static DiffNode NODE_D = DiffNode.empty("D", MY_ROOT); + static DiffValue Value_E = new DiffValue("E", NODE_C, "E"); + static DiffValue Value_F = new DiffValue("F", NODE_D, "F"); + static DiffValue Value_G = new DiffValue("G", NODE_D, "G"); + + static { + MY_ROOT.add(Value_A); + MY_ROOT.add(Value_B); + MY_ROOT.add(NODE_C); + MY_ROOT.add(NODE_D); + NODE_C.add(Value_E); + NODE_D.add(Value_F); + NODE_D.add(Value_G); + } + + + // -------------------------------------------------------------------------------- + // + // Element testing routines + // + // -------------------------------------------------------------------------------- + + private class ElementTest extends TestDataProvider { + public DiffElement elt; + public String name; + public String fullName; + public DiffElement parent; + + private ElementTest(DiffValue elt, DiffValue parent, String name, String fullName) { + this(elt.getBinding(), parent.getBinding(), name, fullName); + } + + private ElementTest(DiffElement elt, DiffElement parent, String name, String fullName) { + super(ElementTest.class); + this.elt = elt; + this.name = name; + this.fullName = fullName; + this.parent = parent; + } + + public String toString() { + return String.format("ElementTest elt=%s name=%s fullName=%s parent=%s", + elt.toOneLineString(), name, fullName, parent.getName()); + } + } + + @DataProvider(name = "elementdata") + public Object[][] createElementData() { + new ElementTest(MY_ROOT.getBinding(), DiffElement.ROOT, "MY_ROOT", "MY_ROOT"); + new ElementTest(NODE_C, MY_ROOT, "C", "MY_ROOT.C"); + new ElementTest(NODE_D, MY_ROOT, "D", "MY_ROOT.D"); + new ElementTest(Value_A, MY_ROOT, "A", "MY_ROOT.A"); + new ElementTest(Value_B, MY_ROOT, "B", "MY_ROOT.B"); + new ElementTest(Value_E, NODE_C, "E", "MY_ROOT.C.E"); + new ElementTest(Value_F, NODE_D, "F", "MY_ROOT.D.F"); + new ElementTest(Value_G, NODE_D, "G", "MY_ROOT.D.G"); + return TestDataProvider.getTests(ElementTest.class); + } + + @Test(enabled = true, dataProvider = "elementdata") + public void testElementMethods(ElementTest test) { + Assert.assertNotNull(test.elt.getName()); + Assert.assertNotNull(test.elt.getParent()); + Assert.assertEquals(test.elt.getName(), test.name); + Assert.assertEquals(test.elt.getParent(), test.parent); + Assert.assertEquals(test.elt.fullyQualifiedName(), test.fullName); + } + + // -------------------------------------------------------------------------------- + // + // DiffValue testing routines + // + // -------------------------------------------------------------------------------- + + private class LeafTest extends TestDataProvider { + public DiffValue diffvalue; + public Object value; + + private LeafTest(DiffValue diffvalue, Object value) { + super(LeafTest.class); + this.diffvalue = diffvalue; + this.value = value; + } + + public String toString() { + return String.format("LeafTest diffvalue=%s value=%s", diffvalue.toOneLineString(), value); + } + } + + @DataProvider(name = "leafdata") + public Object[][] createLeafData() { + new LeafTest(Value_A, "A"); + new LeafTest(Value_B, "B"); + new LeafTest(Value_E, "E"); + new LeafTest(Value_F, "F"); + new LeafTest(Value_G, "G"); + return TestDataProvider.getTests(LeafTest.class); + } + + @Test(enabled = true, dataProvider = "leafdata") + public void testLeafMethods(LeafTest test) { + Assert.assertNotNull(test.diffvalue.getValue()); + Assert.assertEquals(test.diffvalue.getValue(), test.value); + } + + // -------------------------------------------------------------------------------- + // + // Node testing routines + // + // -------------------------------------------------------------------------------- + + private class NodeTest extends TestDataProvider { + public DiffNode node; + public Set fields; + public Set subnodes; + public Set allNames; + + private NodeTest(DiffNode node, List fields, List subnodes) { + super(NodeTest.class); + this.node = node; + this.fields = new HashSet(fields); + this.subnodes = new HashSet(subnodes); + this.allNames = new HashSet(fields); + allNames.addAll(subnodes); + } + + public String toString() { + return String.format("NodeTest node=%s fields=%s subnodes=%s", + node.toOneLineString(), fields, subnodes); + } + } + + @DataProvider(name = "nodedata") + public Object[][] createData1() { + new NodeTest(MY_ROOT, Arrays.asList("A", "B"), Arrays.asList("C", "D")); + new NodeTest(NODE_C, Arrays.asList("E"), Collections.emptyList()); + new NodeTest(NODE_D, Arrays.asList("F", "G"), Collections.emptyList()); + return TestDataProvider.getTests(NodeTest.class); + } + + @Test(enabled = true, dataProvider = "nodedata") + public void testNodeAccessors(NodeTest test) { + Assert.assertNotNull(test.node.getElements()); + + for ( String name : test.allNames ) { + DiffElement elt = test.node.getElement(name); + Assert.assertNotNull(elt, "Failed to find field " + elt + " in " + test.node); + Assert.assertEquals(elt.getName(), name); + Assert.assertEquals(elt.getValue().isAtomic(), test.fields.contains(name), "Failed atomic/compound expectation: " + test.node); + } + } + + // NOTE: add routines are being implicitly tested by the creation of the data structures + + @Test(enabled = true, dataProvider = "nodedata") + public void testCounts(NodeTest test) { + Assert.assertEquals(test.node.getElements().size(), test.allNames.size()); + Assert.assertEquals(test.node.getElementNames(), test.allNames); + } + + // -------------------------------------------------------------------------------- + // + // fromString testing routines + // + // -------------------------------------------------------------------------------- + + private class FromStringTest extends TestDataProvider { + public String string; + public DiffElement expected; + + private FromStringTest(String string, DiffElement expected) { + super(FromStringTest.class); + this.string = string; + this.expected = expected; + } + + public String toString() { + return String.format("FromStringTest string=%s expected=%s", string, expected.toOneLineString()); + } + } + + @DataProvider(name = "fromstringdata") + public Object[][] createFromData() { + new FromStringTest("A=A", Value_A.getBinding()); + new FromStringTest("B=B", Value_B.getBinding()); + new FromStringTest("C=(E=E)", NODE_C.getBinding()); + new FromStringTest("D=(F=F G=G)", NODE_D.getBinding()); + return TestDataProvider.getTests(FromStringTest.class); + } + + @Test(enabled = true, dataProvider = "fromstringdata") + public void parseFromString(FromStringTest test) { + logger.warn("Testing from string: " + test.string); + DiffElement elt = DiffNode.fromString(test.string); + Assert.assertEquals(elt.toOneLineString(), test.expected.toOneLineString()); + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffableReaderUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffableReaderUnitTest.java new file mode 100644 index 000000000..e20ba1625 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffableReaderUnitTest.java @@ -0,0 +1,173 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.utils.diffengine; + + +// the imports for unit testing. + + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.diffengine.DiffElement; +import org.broadinstitute.gatk.utils.diffengine.DiffEngine; +import org.broadinstitute.gatk.utils.diffengine.DiffNode; +import org.broadinstitute.gatk.utils.diffengine.DiffableReader; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.variantcontext.Allele; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +/** + * Basic unit test for DifferableReaders in reduced reads + */ +public class DiffableReaderUnitTest extends BaseTest { + DiffEngine engine; + + File vcfFile = new File(privateTestDir + "diffTestMaster.vcf"); + File bamFile = new File(publicTestDir + "exampleBAM.bam"); + + @BeforeClass(enabled = true) + public void createDiffEngine() { + engine = new DiffEngine(); + } + + @Test(enabled = true) + public void testPluggableDiffableReaders() { + logger.warn("testPluggableDiffableReaders"); + Map readers = engine.getReaders(); + Assert.assertNotNull(readers); + Assert.assertTrue(readers.size() > 0); + Assert.assertNotNull(readers.get("VCF")); + for ( Map.Entry e : engine.getReaders().entrySet() ) { + logger.warn("Found diffable reader: " + e.getKey()); + Assert.assertEquals(e.getValue().getName(), e.getKey()); + Assert.assertEquals(e.getValue(), engine.getReader(e.getKey())); + } + } + + private static void testLeaf(DiffNode rec, String field, Object expected) { + DiffElement value = rec.getElement(field); + Assert.assertNotNull(value, "Expected to see leaf named " + field + " in rec " + rec); + Assert.assertEquals(value.getValue().getValue(), expected, "Expected to see leaf named " + field + " to have value " + expected + " in rec " + rec + " but got instead " + value.getValue().getValue()); + } + + @Test(enabled = true, dependsOnMethods = "testPluggableDiffableReaders") + public void testVCF1() { + logger.warn("testVCF1"); + DiffableReader vcfReader = engine.getReader("VCF"); + Assert.assertTrue(vcfReader.canRead(vcfFile)); + Assert.assertFalse(vcfReader.canRead(bamFile)); + + DiffElement diff = vcfReader.readFromFile(vcfFile, -1); + Assert.assertNotNull(diff); + + Assert.assertEquals(diff.getName(), vcfFile.getName()); + Assert.assertSame(diff.getParent(), DiffElement.ROOT); + + DiffNode node = diff.getValueAsNode(); + Assert.assertEquals(node.getElements().size(), 11); + + // chr1 2646 rs62635284 G A 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:53,75:3:-12.40,-0.90,-0.00:9.03 + DiffNode rec1 = node.getElement("chr1:2646").getValueAsNode(); + testLeaf(rec1, "CHROM", "chr1"); + testLeaf(rec1, "POS", 2646); + testLeaf(rec1, "ID", "rs62635284"); + testLeaf(rec1, "REF", Allele.create("G", true)); + testLeaf(rec1, "ALT", Arrays.asList(Allele.create("A"))); + testLeaf(rec1, "QUAL", 0.15); + testLeaf(rec1, "FILTER", VCFConstants.PASSES_FILTERS_v4); + testLeaf(rec1, "AC", "2"); + testLeaf(rec1, "AF", "1.00"); + testLeaf(rec1, "AN", "2"); + } + + @Test(enabled = true, dependsOnMethods = "testPluggableDiffableReaders") + public void testBAM() { + logger.warn("testBAM"); + DiffableReader bamReader = engine.getReader("BAM"); + Assert.assertTrue(bamReader.canRead(bamFile)); + Assert.assertFalse(bamReader.canRead(vcfFile)); + + DiffElement diff = bamReader.readFromFile(bamFile, -1); + Assert.assertNotNull(diff); + + Assert.assertEquals(diff.getName(), bamFile.getName()); + Assert.assertSame(diff.getParent(), DiffElement.ROOT); + + DiffNode node = diff.getValueAsNode(); + Assert.assertEquals(node.getElements().size(), 33); + + // 30PPJAAXX090125:1:42:512:1817#0 99 chr1 200 0 76M = + // 255 -130 ACCCTAACCCTAACCCTAACCCTAACCATAACCCTAAGACTAACCCTAAACCTAACCCTCATAATCGAAATACAAC + // BBBBC@C?AABCBB<63>=B@>+B9-9+)2B8,+@327B5A>90((>-+''3?(/'''A)(''19('7.,**%)3: + // PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam + + DiffNode rec1 = node.getElement("30PPJAAXX090125:1:42:512:1817#0_1").getValueAsNode(); + testLeaf(rec1, "NAME", "30PPJAAXX090125:1:42:512:1817#0"); + testLeaf(rec1, "FLAGS", 99); + testLeaf(rec1, "RNAME", "chr1"); + testLeaf(rec1, "POS", 200); + testLeaf(rec1, "MAPQ", 0); + testLeaf(rec1, "CIGAR", "76M"); + testLeaf(rec1, "RNEXT", "chr1"); + testLeaf(rec1, "PNEXT", 255); + testLeaf(rec1, "TLEN", -130); + testLeaf(rec1, "SEQ", "ACCCTAACCCTAACCCTAACCCTAACCATAACCCTAAGACTAACCCTAAACCTAACCCTCATAATCGAAATACAAC"); + testLeaf(rec1, "QUAL", "BBBBC@C?AABCBB<63>=B@>+B9-9+)2B8,+@327B5A>90((>-+''3?(/'''A)(''19('7.,**%)3:"); + testLeaf(rec1, "PG", "0"); + testLeaf(rec1, "RG", "exampleBAM.bam"); + testLeaf(rec1, "SM", "exampleBAM.bam"); + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DifferenceUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DifferenceUnitTest.java new file mode 100644 index 000000000..cee923476 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DifferenceUnitTest.java @@ -0,0 +1,118 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.utils.diffengine; + + +// the imports for unit testing. + + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.diffengine.DiffElement; +import org.broadinstitute.gatk.utils.diffengine.DiffNode; +import org.broadinstitute.gatk.utils.diffengine.Difference; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +/** + * Basic unit test for DifferableReaders in reduced reads + */ +public class DifferenceUnitTest extends BaseTest { + // -------------------------------------------------------------------------------- + // + // testing routines + // + // -------------------------------------------------------------------------------- + + private class DifferenceTest extends TestDataProvider { + public DiffElement tree1, tree2; + public String difference; + + private DifferenceTest(String tree1, String tree2, String difference) { + this(DiffNode.fromString(tree1), DiffNode.fromString(tree2), difference); + } + + private DifferenceTest(DiffElement tree1, DiffElement tree2, String difference) { + super(DifferenceTest.class); + this.tree1 = tree1; + this.tree2 = tree2; + this.difference = difference; + } + + public String toString() { + return String.format("tree1=%s tree2=%s diff=%s", + tree1 == null ? "null" : tree1.toOneLineString(), + tree2 == null ? "null" : tree2.toOneLineString(), + difference); + } + } + + @DataProvider(name = "data") + public Object[][] createTrees() { + new DifferenceTest("A=X", "A=Y", "A:1:X!=Y"); + new DifferenceTest("A=Y", "A=X", "A:1:Y!=X"); + new DifferenceTest(DiffNode.fromString("A=X"), null, "A:1:X!=MISSING"); + new DifferenceTest(null, DiffNode.fromString("A=X"), "A:1:MISSING!=X"); + return DifferenceTest.getTests(DifferenceTest.class); + } + + @Test(enabled = true, dataProvider = "data") + public void testDiffToString(DifferenceTest test) { + logger.warn("Test tree1: " + (test.tree1 == null ? "null" : test.tree1.toOneLineString())); + logger.warn("Test tree2: " + (test.tree2 == null ? "null" : test.tree2.toOneLineString())); + logger.warn("Test expected diff : " + test.difference); + Difference diff = new Difference(test.tree1, test.tree2); + logger.warn("Observed diffs : " + diff); + Assert.assertEquals(diff.toString(), test.difference, "Observed diff string " + diff + " not equal to expected difference string " + test.difference ); + + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java deleted file mode 100644 index 619e96654..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java +++ /dev/null @@ -1,857 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.genotyper; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.tools.walkers.genotyper.*; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import org.testng.Assert; -import org.testng.SkipException; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * Test code for {@link ReadLikelihoods} - * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class ReadLikelihoodsUnitTest -{ - private static final double EPSILON = 1e-6; - private static final int ODD_READ_START = 101; - private static final int EVEN_READ_START = 1; - - @Test(dataProvider = "dataSets") - public void testInstantiationAndQuery(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods result = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - - Assert.assertEquals(result.sampleCount(), samples.length); - Assert.assertEquals(result.alleleCount(), alleles.length); - - - testSampleQueries(samples, reads, result); - testAlleleQueries(alleles, result); - testLikelihoodMatrixQueries(samples, result, null); - } - - @Test(dataProvider = "dataSets") - public void testLikelihoodFillingAndQuery(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods result = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - final double[][][] likelihoods = fillWithRandomLikelihoods(samples, alleles, result); - testLikelihoodMatrixQueries(samples, result, likelihoods); - } - - private double[][][] fillWithRandomLikelihoods(final String[] samples, final Allele[] alleles, final ReadLikelihoods result) { - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - final double[][][] likelihoods = new double[samples.length][alleles.length][]; - for (int s = 0; s < likelihoods.length; s++) { - final ReadLikelihoods.Matrix sampleLikelihoods = result.sampleMatrix(s); - for (int a = 0; a < likelihoods[s].length; a++) { - likelihoods[s][a] = new double[result.sampleReadCount(s)]; - for (int r = 0; r < likelihoods[s][a].length; r++) - sampleLikelihoods.set(a,r,likelihoods[s][a][r] = -Math.abs(rnd.nextGaussian())); - } - } - return likelihoods; - } - - @Test(dataProvider = "dataSets") - public void testBestAlleles(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - fillWithRandomLikelihoods(samples,alleles,original); - final int alleleCount = alleles.length; - for (int s = 0; s < samples.length; s++) { - final int sampleReadCount = original.sampleReadCount(s); - final ReadLikelihoods.Matrix sampleMatrix = original.sampleMatrix(s); - final double[] bestLkArray = new double[sampleReadCount]; - final int[] bestIndexArray = new int[sampleReadCount]; - final double[] confidenceArray = new double[sampleReadCount]; - for (int r = 0; r < sampleReadCount; r++) { - int bestAlleleIndex = -1; - double bestAlleleLk = Double.NEGATIVE_INFINITY; - double secondBestAlleleLk = Double.NEGATIVE_INFINITY; - for (int a = 0; a < alleleCount; a++) { - final double lk = sampleMatrix.get(a,r); - if (lk > bestAlleleLk) { - secondBestAlleleLk = bestAlleleLk; - bestAlleleLk = lk; - bestAlleleIndex = a; - } else if (lk > secondBestAlleleLk) { - secondBestAlleleLk = lk; - } - } - bestLkArray[r] = bestAlleleLk; - confidenceArray[r] = bestAlleleLk - secondBestAlleleLk; - bestIndexArray[r] = bestAlleleIndex; - } - final Collection.BestAllele> bestAlleles = original.bestAlleles(); - for (final ReadLikelihoods.BestAllele bestAllele : bestAlleles) { - final int readIndex = original.readIndex(s,bestAllele.read); - if (readIndex == -1) continue; - Assert.assertEquals(bestLkArray[readIndex],bestAllele.likelihood); - Assert.assertEquals(bestAllele.allele,alleles[bestIndexArray[readIndex]]); - Assert.assertEquals(bestAllele.confidence,confidenceArray[readIndex],EPSILON); - } - } - } - - @Test(dataProvider = "dataSets") - public void testBestAlleleMap(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - fillWithRandomLikelihoods(samples,alleles,original); - final Map> expected = new HashMap<>(alleles.length); - for (final Allele allele : alleles) - expected.put(allele,new ArrayList()); - - final int alleleCount = alleles.length; - for (int s = 0; s < samples.length; s++) { - final int sampleReadCount = original.sampleReadCount(s); - final ReadLikelihoods.Matrix sampleMatrix = original.sampleMatrix(s); - for (int r = 0; r < sampleReadCount; r++) { - int bestAlleleIndex = -1; - double bestAlleleLk = Double.NEGATIVE_INFINITY; - double secondBestAlleleLk = Double.NEGATIVE_INFINITY; - for (int a = 0; a < alleleCount; a++) { - final double lk = sampleMatrix.get(a,r); - if (lk > bestAlleleLk) { - secondBestAlleleLk = bestAlleleLk; - bestAlleleLk = lk; - bestAlleleIndex = a; - } else if (lk > secondBestAlleleLk) { - secondBestAlleleLk = lk; - } - } - if ((bestAlleleLk - secondBestAlleleLk) > ReadLikelihoods.BestAllele.INFORMATIVE_THRESHOLD) - expected.get(alleles[bestAlleleIndex]).add(sampleMatrix.readAt(r)); - } - } - - final Map> actual = original.readsByBestAlleleMap(); - - Assert.assertEquals(actual.size(),alleles.length); - for (final Allele allele : alleles) { - final List expectedList = expected.get(allele); - final List actualList = actual.get(allele); - final Set expectedSet = new HashSet<>(expectedList); - final Set actualSet = new HashSet<>(actualList); - Assert.assertEquals(actualSet,expectedSet); - } - } - - @Test(dataProvider = "dataSets") - public void testFilterPoorlyModeledReads(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - - for (int s = 0; s < samples.length; s++) { - final int sampleReadCount = original.sampleReadCount(s); - for (int r = 0; r < sampleReadCount; r++) { - if ((r & 1) == 0) continue; - for (int a = 0; a < alleles.length; a++) - original.sampleMatrix(s).set(a,r,-10000); - } - } - - final ReadLikelihoods result = original.clone(); - result.filterPoorlyModeledReads(2.0); - - for (int s = 0; s < samples.length; s++) { - final int oldSampleReadCount = original.sampleReadCount(s); - final int newSampleReadCount = result.sampleReadCount(s); - Assert.assertEquals(newSampleReadCount,(oldSampleReadCount + 1) / 2); - final ReadLikelihoods.Matrix newSampleMatrix = result.sampleMatrix(s); - final ReadLikelihoods.Matrix oldSampleMatrix = original.sampleMatrix(s); - for (int r = 0 ; r < newSampleReadCount; r++) { - Assert.assertEquals(original.readIndex(s, result.sampleReads(s).get(r)), r * 2); - for (int a = 0; a < alleles.length; a++) { - Assert.assertEquals(newSampleMatrix.get(a,r),oldSampleMatrix.get(a,r*2)); - } - } - } - } - - @Test(dataProvider = "dataSets") - public void testFilterReadsToOverlap(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - final GenomeLoc evenReadOverlap = locParser.createGenomeLoc(SAM_HEADER.getSequenceDictionary().getSequences().get(0).getSequenceName(),EVEN_READ_START ,EVEN_READ_START ); - fillWithRandomLikelihoods(samples,alleles,original); - final ReadLikelihoods result = original.clone(); - result.filterToOnlyOverlappingUnclippedReads(evenReadOverlap); - final double[][][] newLikelihoods = new double[samples.length][alleles.length][]; - for (int s = 0; s < samples.length ; s++) - for (int a = 0; a < alleles.length; a++) { - newLikelihoods[s][a] = new double[(original.sampleReadCount(s) + 1) / 2]; - final ReadLikelihoods.Matrix sampleMatrix = original.sampleMatrix(s); - for (int r = 0; r < newLikelihoods[s][a].length; r++) { - Assert.assertEquals(result.readIndex(s,sampleMatrix.readAt(r << 1)),r); - newLikelihoods[s][a][r] = sampleMatrix.get(a, r << 1); - } - } - testLikelihoodMatrixQueries(samples,result,newLikelihoods); - } - - @Test(dataProvider = "marginalizationDataSets") - public void testMarginalizationWithOverlap(final String[] samples, final Allele[] alleles, final Map> reads, final Map> newToOldAlleleMapping) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - final GenomeLoc evenReadOverlap = locParser.createGenomeLoc(SAM_HEADER.getSequenceDictionary().getSequences().get(0).getSequenceName(),EVEN_READ_START ,EVEN_READ_START ); - fillWithRandomLikelihoods(samples, alleles, original); - final ReadLikelihoods marginalized = original.marginalize(newToOldAlleleMapping,evenReadOverlap); - Assert.assertNotNull(marginalized); - Assert.assertEquals(newToOldAlleleMapping.size(),marginalized.alleleCount()); - for (int a = 0; a < marginalized.alleleCount(); a++) { - final List oldAlleles = newToOldAlleleMapping.get(marginalized.alleleAt(a)); - Assert.assertNotNull(oldAlleles); - for (int s = 0; s < samples.length; s++) { - final ReadLikelihoods.Matrix oldSmapleLikelihoods = original.sampleMatrix(s); - final ReadLikelihoods.Matrix sampleLikelihoods = marginalized.sampleMatrix(s); - final int sampleReadCount = sampleLikelihoods.readCount(); - final int oldSampleReadCount = oldSmapleLikelihoods.readCount(); - Assert.assertEquals(sampleReadCount,(oldSampleReadCount + 1) / 2); - for (int r = 0; r < sampleReadCount; r++) { - double oldBestLk = Double.NEGATIVE_INFINITY; - for (final Allele oldAllele : oldAlleles) { - oldBestLk = Math.max(oldSmapleLikelihoods.get(original.alleleIndex(oldAllele),r << 1), oldBestLk); - } - Assert.assertEquals(sampleLikelihoods.get(a,r),oldBestLk); - } - } - } - } - - @Test(dataProvider = "marginalizationDataSets") - public void testMarginalization(final String[] samples, final Allele[] alleles, final Map> reads, final Map> newToOldAlleleMapping) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - fillWithRandomLikelihoods(samples, alleles, original); - final ReadLikelihoods marginalized = original.marginalize(newToOldAlleleMapping); - Assert.assertNotNull(marginalized); - Assert.assertEquals(newToOldAlleleMapping.size(),marginalized.alleleCount()); - for (int a = 0; a < marginalized.alleleCount(); a++) { - final List oldAlleles = newToOldAlleleMapping.get(marginalized.alleleAt(a)); - Assert.assertNotNull(oldAlleles); - for (int s = 0; s < samples.length; s++) { - final ReadLikelihoods.Matrix oldSmapleLikelihoods = original.sampleMatrix(s); - final ReadLikelihoods.Matrix sampleLikelihoods = marginalized.sampleMatrix(s); - final int sampleReadCount = sampleLikelihoods.readCount(); - final int oldSampleReadCount = oldSmapleLikelihoods.readCount(); - Assert.assertEquals(oldSampleReadCount,sampleReadCount); - for (int r = 0; r < sampleReadCount; r++) { - double oldBestLk = Double.NEGATIVE_INFINITY; - for (final Allele oldAllele : oldAlleles) { - oldBestLk = Math.max(oldSmapleLikelihoods.get(original.alleleIndex(oldAllele),r), oldBestLk); - } - Assert.assertEquals(sampleLikelihoods.get(a,r),oldBestLk); - } - } - } - } - - @Test(dataProvider = "dataSets") - public void testNormalizeBestToZero(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); - final ReadLikelihoods result= original.clone(); - result.normalizeLikelihoods(true, Double.NEGATIVE_INFINITY); - testAlleleQueries(alleles,result); - final int alleleCount = alleles.length; - final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][]; - for (int s = 0; s < samples.length; s++) { - final int sampleReadCount = original.sampleReadCount(s); - for (int a = 0; a < alleleCount; a++) - newLikelihoods[s][a] = new double[sampleReadCount]; - for (int r = 0; r < sampleReadCount; r++) { - double bestLk = originalLikelihoods[s][0][r]; - for (int a = 1; a < alleleCount; a++) { - bestLk = Math.max(bestLk,originalLikelihoods[s][a][r]); - } - for (int a = 0; a < alleleCount; a++) { - newLikelihoods[s][a][r] = originalLikelihoods[s][a][r] - bestLk; - } - } - } - testLikelihoodMatrixQueries(samples,result,newLikelihoods); - } - - @Test(dataProvider = "dataSets") - public void testNormalizeCapWorstLK(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); - final ReadLikelihoods result= original.clone(); - result.normalizeLikelihoods(false, - 0.001); - testAlleleQueries(alleles,result); - final int alleleCount = alleles.length; - final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][]; - for (int s = 0; s < samples.length; s++) { - final int sampleReadCount = original.sampleReadCount(s); - for (int a = 0; a < alleleCount; a++) - newLikelihoods[s][a] = new double[sampleReadCount]; - for (int r = 0; r < sampleReadCount; r++) { - double bestAltLk = Double.NEGATIVE_INFINITY; - for (int a = 0; a < alleleCount; a++) { - if (alleles[a].isReference()) - continue; - bestAltLk = Math.max(bestAltLk,originalLikelihoods[s][a][r]); - } - if (bestAltLk == Double.NEGATIVE_INFINITY) - for (int a = 0; a < alleleCount; a++) { - newLikelihoods[s][a][r] = originalLikelihoods[s][a][r]; - } - else - for (int a = 0; a < alleleCount; a++) { - newLikelihoods[s][a][r] = Math.max(originalLikelihoods[s][a][r],bestAltLk - 0.001); - } - } - } - testLikelihoodMatrixQueries(samples,result,newLikelihoods); - } - - @Test(dataProvider = "dataSets") - public void testNormalizeCapWorstLKAndBestToZero(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); - final ReadLikelihoods result= original.clone(); - result.normalizeLikelihoods(true, - 0.001); - testAlleleQueries(alleles,result); - final int alleleCount = alleles.length; - final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][]; - for (int s = 0; s < samples.length; s++) { - final int sampleReadCount = original.sampleReadCount(s); - for (int a = 0; a < alleleCount; a++) - newLikelihoods[s][a] = new double[sampleReadCount]; - for (int r = 0; r < sampleReadCount; r++) { - double bestAltLk = Double.NEGATIVE_INFINITY; - double bestLk = Double.NEGATIVE_INFINITY; - for (int a = 0; a < alleleCount; a++) { - bestLk = Math.max(bestLk,originalLikelihoods[s][a][r]); - if (alleles[a].isReference()) - continue; - bestAltLk = Math.max(bestAltLk,originalLikelihoods[s][a][r]); - } - if (bestAltLk == Double.NEGATIVE_INFINITY) - for (int a = 0; a < alleleCount; a++) { - newLikelihoods[s][a][r] = originalLikelihoods[s][a][r] - bestLk; - } - else - for (int a = 0; a < alleleCount; a++) { - newLikelihoods[s][a][r] = Math.max(originalLikelihoods[s][a][r],bestAltLk - 0.001) - bestLk; - } - } - } - testLikelihoodMatrixQueries(samples,result,newLikelihoods); - } - - - @Test(dataProvider = "dataSets") - public void testAddMissingAlleles(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); - final ReadLikelihoods result = original.clone(); - - // If all the alleles pass are present in the read-likelihoods collection there is no change. - result.addMissingAlleles(result.alleles(),Double.NEGATIVE_INFINITY); - testLikelihoodMatrixQueries(samples,result,originalLikelihoods); - - // If the allele list passed is empty there is no effect. - result.addMissingAlleles(Collections.EMPTY_LIST,Double.NEGATIVE_INFINITY); - testLikelihoodMatrixQueries(samples,result,originalLikelihoods); - - final Allele newOne; - final Allele newTwo; - final Allele newThree; - - // We add a single missing. - result.addMissingAlleles(Arrays.asList(newOne = Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-12345.6); - Assert.assertEquals(result.alleleCount(), original.alleleCount() + 1); - - // We add too more amongst exisisting alleles: - result.addMissingAlleles(Arrays.asList(newTwo = Allele.create("ATATATTATATTAATATT".getBytes(), false),result.alleleAt(1), - result.alleleAt(0),newThree = Allele.create("TGTGTGTATTG".getBytes(),false),Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-6.54321); - - Assert.assertEquals(original.alleleCount()+3,result.alleleCount()); - - final List expectedAlleles = new ArrayList<>(original.alleles()); - expectedAlleles.add(newOne); expectedAlleles.add(newTwo); expectedAlleles.add(newThree); - - Assert.assertEquals(result.alleles(),expectedAlleles); - - final double[][][] newLikelihoods = new double[originalLikelihoods.length][][]; - for (int s = 0; s < samples.length; s++) { - newLikelihoods[s] = Arrays.copyOf(originalLikelihoods[s],originalLikelihoods[s].length + 3); - final int sampleReadCount = original.sampleReadCount(s); - final int originalAlleleCount = originalLikelihoods[s].length; - newLikelihoods[s][originalAlleleCount] = new double[sampleReadCount]; - Arrays.fill(newLikelihoods[s][originalAlleleCount],-12345.6); - newLikelihoods[s][originalAlleleCount+1] = new double[sampleReadCount]; - Arrays.fill(newLikelihoods[s][originalAlleleCount+1],-6.54321); - newLikelihoods[s][originalAlleleCount+2] = new double[sampleReadCount]; - Arrays.fill(newLikelihoods[s][originalAlleleCount+2],-6.54321); - } - testLikelihoodMatrixQueries(samples,result,newLikelihoods); - } - - - @Test(dataProvider = "dataSets") - public void testAddNonRefAllele(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); - final ReadLikelihoods result = original.clone(); - result.addNonReferenceAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - Assert.assertEquals(result.alleleCount(),original.alleleCount() + 1); - Assert.assertEquals(result.alleleIndex(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE),result.alleleCount() - 1); - final double[][][] newLikelihoods = new double[originalLikelihoods.length][][]; - for (int s = 0; s < samples.length; s++) { - newLikelihoods[s] = Arrays.copyOf(originalLikelihoods[s],originalLikelihoods[s].length + 1); - final int sampleReadCount = original.sampleReadCount(s); - final int ordinaryAlleleCount = originalLikelihoods[s].length; - newLikelihoods[s][ordinaryAlleleCount] = new double[sampleReadCount]; - for (int r = 0; r < sampleReadCount; r++) { - double bestLk = newLikelihoods[s][0][r]; - double secondBestLk = Double.NEGATIVE_INFINITY; - for (int a = 1; a < ordinaryAlleleCount; a++) { - final double lk = originalLikelihoods[s][a][r]; - if (lk > bestLk) { - secondBestLk = bestLk; - bestLk = lk; - } else if (lk > secondBestLk) { - secondBestLk = lk; - } - } - final double expectedNonRefLk = Double.isInfinite(secondBestLk) ? bestLk : secondBestLk; - newLikelihoods[s][ordinaryAlleleCount][r] = expectedNonRefLk; - } - } - testLikelihoodMatrixQueries(samples,result,newLikelihoods); - } - - private void testLikelihoodMatrixQueries(String[] samples, ReadLikelihoods result, final double[][][] likelihoods) { - for (final String sample : samples) { - final int sampleIndex = result.sampleIndex(sample); - final int sampleReadCount = result.sampleReadCount(sampleIndex); - final int alleleCount = result.alleleCount(); - Assert.assertEquals(result.alleleCount(), alleleCount); - for (int a = 0; a < alleleCount; a++) { - Assert.assertEquals(result.sampleReadCount(sampleIndex),sampleReadCount); - for (int r = 0; r < sampleReadCount; r++) - Assert.assertEquals(result.sampleMatrix(sampleIndex).get(a,r), - likelihoods == null ? 0.0 : likelihoods[sampleIndex][a][r], EPSILON); - } - } - } - - private void testAlleleQueries(Allele[] alleles, ReadLikelihoods result) { - final Set alleleIndices = new HashSet<>(); - for (final Allele allele : alleles) { - final int alleleIndex = result.alleleIndex(allele); - Assert.assertTrue(alleleIndex >= 0); - Assert.assertFalse(alleleIndices.contains(alleleIndex)); - alleleIndices.add(alleleIndex); - Assert.assertSame(allele,alleles[alleleIndex]); - } - } - - private void testSampleQueries(String[] samples, Map> reads, ReadLikelihoods result) { - final Set sampleIds = new HashSet<>(samples.length); - for (final String sample : samples) { - final int sampleIndex = result.sampleIndex(sample); - Assert.assertTrue(sampleIndex >= 0); - Assert.assertFalse(sampleIds.contains(sampleIndex)); - sampleIds.add(sampleIndex); - - final List sampleReads = result.sampleReads(sampleIndex); - final Set sampleReadsSet = new HashSet<>(sampleReads); - final List expectedSampleReadArray = reads.get(sample); - final Set expectedSampleReadsSet = new HashSet<>(expectedSampleReadArray); - Assert.assertEquals(sampleReadsSet,expectedSampleReadsSet); - - final int sampleReadCount = sampleReads.size(); - for (int r = 0; r < sampleReadCount; r++) { - Assert.assertSame(sampleReads.get(r), expectedSampleReadArray.get(r)); - final int readIndex = result.readIndex(sampleIndex, sampleReads.get(r)); - Assert.assertEquals(readIndex,r); - } - } - } - - private String[][] SAMPLE_SETS = new String[][] { - {"A","B","C"}, - {"A"}, - {"C","A","D","E","Salsa","Gazpacho"}, - }; - - private Allele[][] ALLELE_SETS = new Allele[][] { - {Allele.create("A",true), Allele.create("T"), Allele.create("C")}, - {Allele.create("A",true)}, - {Allele.create("ATTTA"), Allele.create("A",true)}, - {Allele.create("A"), Allele.create("AT",true)}, - {Allele.create("A",false), Allele.create("AT",false)}, - }; - - @DataProvider(name="marginalizationDataSets") - public Object[][] marginalizationDataSets() { - try { - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - final Object[][] result = new Object[SAMPLE_SETS.length * ALLELE_SETS.length * ALLELE_SETS.length][]; - int nextIndex = 0; - for (int s = 0; s < SAMPLE_SETS.length; s++) { - for (int a = 0; a < ALLELE_SETS.length; a++) { - for (int b = 0; b < ALLELE_SETS.length; b++) { - if (ALLELE_SETS[b].length < ALLELE_SETS[a].length) - result[nextIndex++] = new Object[]{SAMPLE_SETS[s], ALLELE_SETS[a], - dataSetReads(SAMPLE_SETS[s], rnd), randomAlleleMap(ALLELE_SETS[a], ALLELE_SETS[b]) - }; - } - } - } - return Arrays.copyOf(result,nextIndex); - }catch (final Throwable e) { - throw new RuntimeException(e); - } - } - - private Map> randomAlleleMap(final Allele[] fromAlleles, final Allele[] toAlleles) { - final Map> result = new HashMap<>(toAlleles.length); - for (final Allele toAllele : toAlleles ) - result.put(toAllele,new ArrayList(fromAlleles.length)); - final ArrayList remaining = new ArrayList<>(Arrays.asList(fromAlleles)); - int nextToIndex = 0; - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - for (int i = 0; i < fromAlleles.length; i++) { - final int fromAlleleIndex = rnd.nextInt(remaining.size()); - result.get(toAlleles[nextToIndex]).add(remaining.remove(fromAlleleIndex)); - nextToIndex = (nextToIndex + 1) % toAlleles.length; - } - return result; - } - - - @DataProvider(name="dataSets") - public Object[][] dataSets() { - try { - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - final Object[][] result = new Object[SAMPLE_SETS.length * ALLELE_SETS.length][]; - int nextIndex = 0; - for (int s = 0; s < SAMPLE_SETS.length; s++) - for (int a = 0; a < ALLELE_SETS.length; a++) { - result[nextIndex++] = new Object[]{SAMPLE_SETS[s], ALLELE_SETS[a], - dataSetReads(SAMPLE_SETS[s], rnd) - }; - } - return result; - }catch (final Throwable e) { - throw new RuntimeException(e); - } - } - - private Map> dataSetReads(final String[] samples, - final Random rnd) { - final Map> result = new HashMap<>(samples.length); - for (final String sample : samples) { - final int readCount = rnd.nextInt(100); - final List reads = new ArrayList<>(readCount); - for (int r = 0; r < readCount; r++) { - final int alignmentStart = (r & 1) == 0 ? EVEN_READ_START : ODD_READ_START; - reads.add(ArtificialSAMUtils.createArtificialRead(SAM_HEADER, - "RRR" + sample + "00" + r, 0, alignmentStart ,"AAAAA".getBytes(), new byte[] {30,30,30,30,30}, "5M")); - } - result.put(sample,reads); - } - return result; - } - - @Test(dataProvider="readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference") - public void testInstantiationAndBasicQueries(final int[] readCounts, final int alleleCount, final boolean hasReference) { - final SampleList sampleList = sampleList(readCounts); - - final AlleleList alleleList = alleleList(alleleCount,hasReference); - final Map> sampleToReads = ReadLikelihoodsUnitTester.sampleToReads(sampleList, readCounts); - final ReadLikelihoods subject = new ReadLikelihoods<>(sampleList,alleleList,sampleToReads); - - AlleleListUnitTester.assertAlleleList(subject, AlleleListUtils.asList(alleleList)); - SampleListUnitTester.assertSampleList(subject,SampleListUtils.asList(sampleList)); - - if (hasReference) { - final int referenceIndex = AlleleListUtils.indexOfReference(alleleList); - Assert.assertTrue(referenceIndex >= 0); - Assert.assertEquals(AlleleListUtils.indexOfReference(alleleList),referenceIndex); - } else { - Assert.assertEquals(AlleleListUtils.indexOfReference(subject), -1); - } - - testLikelihoodMatrixQueries(alleleList, sampleList, sampleToReads, subject); - testAlleleQueries(alleleList, subject); - testSampleQueries(sampleList, sampleToReads, subject); - } - - @Test(dataProvider="readCountsAndAlleleCountDataSkippingNoLikelihoodsOrNoAlleleAndWithReference") - public void testLikelihoodWriting(final int[] readCounts, final int alleleCount, final boolean hasReference) { - final SampleList sampleList = sampleList(readCounts); - - final AlleleList alleleList = alleleList(alleleCount,hasReference); - final Map> sampleToReads = ReadLikelihoodsUnitTester.sampleToReads(sampleList,readCounts); - final ReadLikelihoods subject = new ReadLikelihoods<>(sampleList,alleleList,sampleToReads); - - final int sampleCount = readCounts.length; - int totalLikelihoodsSet = 0; - int expectedLikelihoodsSet = 0; - for (int s = 0; s < sampleCount; s++) { - expectedLikelihoodsSet += readCounts[s] * alleleCount; - final ReadLikelihoods.Matrix matrix = subject.sampleMatrix(s); - final int readCount = matrix.readCount(); - for (int a = 0; a < alleleCount; a++) - for (int r = 0; r < readCount; r++) { - final double likelihood = testLikelihood(s, a, r); - Assert.assertNotEquals(likelihood,0); //Paranoia - totalLikelihoodsSet++; - matrix.set(a,r,likelihood); - Assert.assertEquals(matrix.get(a, r),likelihood); - } - - } - Assert.assertEquals(totalLikelihoodsSet,expectedLikelihoodsSet); - } - - @Test(dependsOnMethods={"testLikelihoodWriting","testInstantiationAndBasicQueries"}, - dataProvider="readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference") - public void testMapConversion(final int[] readCounts, final int alleleCount, final boolean hasReference) { - final SampleList sampleList = sampleList(readCounts); - - final AlleleList alleleList = alleleList(alleleCount,hasReference); - final Map> sampleToReads = ReadLikelihoodsUnitTester.sampleToReads(sampleList,readCounts); - - final Set alleleWithLikelihoodsSet = new HashSet<>(); - final Set readsWithLikelihoodsSet = new HashSet<>(); - final Map map = new HashMap<>(sampleList.sampleCount()); - final int sampleCount = sampleList.sampleCount(); - for (int s = 0; s < sampleCount; s++) { - final String sample = sampleList.sampleAt(s); - final PerReadAlleleLikelihoodMap perSampleMap = new PerReadAlleleLikelihoodMap(); - final List reads = sampleToReads.get(sample); - for (int a = 0; a < alleleCount; a++) - for (int r = 0; r < reads.size(); r++) { - perSampleMap.add(reads.get(r), alleleList.alleleAt(a), testLikelihood(s, a, r)); - alleleWithLikelihoodsSet.add(alleleList.alleleAt(a)); - readsWithLikelihoodsSet.add(reads.get(r)); - } - map.put(sample,perSampleMap); - - } - - ReadLikelihoods subject = ReadLikelihoods.fromPerAlleleReadLikelihoodsMap(map); - - for (int s = 0; s < sampleCount; s++) { - final String sample = sampleList.sampleAt(s); - final int sIndex = subject.sampleIndex(sample); - Assert.assertTrue(sIndex >= 0); - Assert.assertTrue(sIndex < sampleCount); - final int sampleReadCount = sampleToReads.get(sample).size(); - final ReadLikelihoods.Matrix sampleLikelihoods = subject.sampleMatrix(sIndex); - for (int a = 0; a < alleleCount; a++) { - final Allele allele = alleleList.alleleAt(a); - final int aIndex = subject.alleleIndex(allele); - Assert.assertEquals(aIndex >= 0,alleleWithLikelihoodsSet.contains(allele)); - Assert.assertTrue(aIndex < alleleCount); - if (aIndex == -1) continue; - for (int r = 0; r < sampleReadCount; r++) { - final GATKSAMRecord read = sampleToReads.get(sample).get(r); - final int rIndex = subject.readIndex(sIndex,read); - final int rIndex2 = sampleLikelihoods.readIndex(read); - Assert.assertEquals(rIndex,rIndex2); - Assert.assertEquals(rIndex >= 0,readsWithLikelihoodsSet.contains(read)); - Assert.assertTrue(rIndex < sampleReadCount); - if (rIndex == -1) - continue; - final double likelihood = sampleLikelihoods.get(aIndex,rIndex); - Assert.assertEquals(likelihood,testLikelihood(s,a,r)); - } - } - } - } - - private double testLikelihood(final int sampleIndex, final int alleleIndex, final int readIndex) { - return - Math.abs(31 * (sampleIndex + 1) + 101 * alleleIndex + 1009 * readIndex); - } - - - private final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - - private void testLikelihoodMatrixQueries(final AlleleList alleles, final SampleList samples, - final Map> sampleToReads, ReadLikelihoods result) { - for (final String sample : SampleListUtils.asList(samples)) { - final int sampleIndex = result.sampleIndex(sample); - final ReadLikelihoods.Matrix likelihoodMatrix = result.sampleMatrix(sampleIndex); - final int sampleReadCount = sampleToReads.get(sample).size(); - final List reads = sampleToReads.get(sample); - Assert.assertEquals(likelihoodMatrix.alleleCount(), alleles.alleleCount()); - Assert.assertEquals(likelihoodMatrix.readCount(), sampleReadCount); - for (int a = 0; a < likelihoodMatrix.alleleCount(); a++) { - Assert.assertEquals(likelihoodMatrix.alleleAt(a),alleles.alleleAt(a)); - for (int r = 0; r < sampleReadCount; r++) { - Assert.assertEquals(likelihoodMatrix.readAt(r),reads.get(r)); - Assert.assertEquals(likelihoodMatrix.get(a, r), 0.0); - } - } - } - } - - private void testAlleleQueries(final AlleleList alleles, ReadLikelihoods result) { - final Set alleleIndices = new HashSet<>(); - for (final Allele allele : AlleleListUtils.asList(alleles)) { - final int alleleIndex = result.alleleIndex(allele); - Assert.assertTrue(alleleIndex >= 0); - Assert.assertFalse(alleleIndices.contains(alleleIndex)); - alleleIndices.add(alleleIndex); - Assert.assertSame(allele,alleles.alleleAt(alleleIndex)); - } - } - - private void testSampleQueries(final SampleList samples, Map> reads, - final ReadLikelihoods result) { - final Set sampleIds = new HashSet<>(samples.sampleCount()); - for (final String sample : SampleListUtils.asList(samples)) { - final int sampleIndex = result.sampleIndex(sample); - Assert.assertTrue(sampleIndex >= 0); - Assert.assertFalse(sampleIds.contains(sampleIndex)); - sampleIds.add(sampleIndex); - - final List sampleReads = result.sampleReads(sampleIndex); - final Set sampleReadsSet = new HashSet<>(sampleReads); - final List expectedSampleReadArray = reads.get(sample); - final Set expectedSampleReadsSet = new HashSet<>(expectedSampleReadArray); - Assert.assertEquals(sampleReadsSet,expectedSampleReadsSet); - - final int sampleReadCount = sampleReads.size(); - for (int r = 0; r < sampleReadCount; r++) { - Assert.assertSame(sampleReads.get(r), expectedSampleReadArray.get(r)); - final int readIndex = result.readIndex(sampleIndex, sampleReads.get(r)); - Assert.assertEquals(readIndex,r); - } - } - } - - private AlleleList alleleList(final int alleleCount, final boolean hasReference) { - final Allele[] alleles = AlleleListUnitTester.generateRandomAlleles(alleleCount,100); - if (hasReference) { - final int referenceIndex = rnd.nextInt(alleleCount); - alleles[referenceIndex] = Allele.create(alleles[referenceIndex].getBases(),true); - } - final AlleleList alleleList = new IndexedAlleleList<>(alleles); - if (alleleList.alleleCount() != alleles.length) - throw new SkipException("repeated alleles, should be infrequent"); - return alleleList; - } - - private SAMFileHeader SAM_HEADER = ArtificialSAMUtils.createArtificialSamHeader(10, 0, 1000); - final GenomeLocParser locParser = new GenomeLocParser(SAM_HEADER.getSequenceDictionary()); - - - private int[][] READ_COUNTS = new int[][] { - {}, - { 100 }, - { 0 }, - { 0, 0, 0 }, - { 1, 0, 1 }, - { 100, 10 , 100}, - { 1000, 10, 100, 20, 23 } - }; - - private int[] ALLELE_COUNTS = new int[] { 0, 1, 2, 3, 10, 20 }; - - @DataProvider(name="readCountsAndAlleleCountData") - public Object[][] readCountsAndAlleleCountData() { - final Object[][] result = new Object[READ_COUNTS.length * ALLELE_COUNTS.length * 2][]; - int index = 0; - for (final int[] readCounts : READ_COUNTS) - for (final int alleleCount : ALLELE_COUNTS) { - result[index++] = new Object[]{ readCounts, alleleCount, false}; - result[index++] = new Object[]{ readCounts, alleleCount, true}; - } - return result; - } - - @DataProvider(name="readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference") - public Object[][] readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference() { - final Object[][] raw = readCountsAndAlleleCountData(); - final List result = new ArrayList<>(raw.length); - for (final Object[] paramSet : raw) - if (!paramSet[2].equals(true) || !paramSet[1].equals(0)) - result.add(paramSet); - return result.toArray(new Object[result.size()][]); - } - - @DataProvider(name="readCountsAndAlleleCountDataSkippingNoLikelihoodsOrNoAlleleAndWithReference") - public Object[][] readCountsAndAlleleCountDataSkippingNoLikelihoodsOrNoAlleleAndWithReference() { - final Object[][] raw = readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference(); - final List result = new ArrayList<>(raw.length); - for (final Object[] paramSet : raw) { - final int[] readCounts = (int[]) paramSet[0]; - final long totalReadCount = MathUtils.sum(readCounts); - if (totalReadCount > 0) - result.add(paramSet); - } - return result.toArray(new Object[result.size()][]); - } - - private SampleList sampleList(final int[] readCounts) { - final List samples = new ArrayList<>(readCounts.length); - for (int i = 0; i < readCounts.length; i++) - samples.add("SAMPLE_" + i); - return new IndexedSampleList(samples); - } - -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java index 5ca1373bf..28fe83d6c 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/GVCFWriterUnitTest.java @@ -57,6 +57,7 @@ import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeader; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants; import org.testng.Assert; @@ -95,7 +96,7 @@ public class GVCFWriterUnitTest extends BaseTest { private List standardPartition = Arrays.asList(1, 10, 20); private Allele REF = Allele.create("N", true); private Allele ALT = Allele.create("A"); - private List ALLELES = Arrays.asList(REF, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + private List ALLELES = Arrays.asList(REF, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); private final String SAMPLE_NAME = "XXYYZZ"; @BeforeMethod @@ -268,10 +269,10 @@ public class GVCFWriterUnitTest extends BaseTest { Assert.assertEquals(vc.getStart(), start); Assert.assertEquals(vc.getEnd(), stop); if ( nonRef ) { - Assert.assertNotEquals(vc.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + Assert.assertNotEquals(vc.getAlternateAllele(0), GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); } else { Assert.assertEquals(vc.getNAlleles(), 2); - Assert.assertEquals(vc.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + Assert.assertEquals(vc.getAlternateAllele(0), GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE); Assert.assertEquals(vc.getAttributeAsInt(VCFConstants.END_KEY, -1), stop); Assert.assertTrue(vc.hasGenotypes()); Assert.assertTrue(vc.hasGenotype(SAMPLE_NAME)); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlockUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlockUnitTest.java index 59da23416..88b5b6bbc 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlockUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/gvcf/HomRefBlockUnitTest.java @@ -167,4 +167,10 @@ public class HomRefBlockUnitTest extends BaseTest { final VariantContext testVC = new VariantContextBuilder(vc).chr(contig).start(pos).stop(pos).make(); Assert.assertEquals(band.isContiguous(testVC), expected); } + + @Test + public void testToVCFHeaderLine() { + final HomRefBlock band = new HomRefBlock(vc, 10, 20, HomoSapiensConstants.DEFAULT_PLOIDY); + Assert.assertEquals(band.toVCFHeaderLine().getKey(), "GVCFBlock10-20", "Wrong key for HomRefBlock " + band); + } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeBaseComparatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeBaseComparatorUnitTest.java deleted file mode 100644 index 528da1762..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeBaseComparatorUnitTest.java +++ /dev/null @@ -1,82 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.Utils; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -public class HaplotypeBaseComparatorUnitTest extends BaseTest { - @Test - public void testComparison() { - final List rawStrings = Arrays.asList("A", "C", "AC", "CT", "GTC", "ACGT"); - final List lexStrings = new ArrayList(rawStrings); - Collections.sort(lexStrings); - - for ( final List seqs : Utils.makePermutations(lexStrings, lexStrings.size(), false) ) { - final List haps = new ArrayList(seqs.size()); - for ( final String seq : seqs ) { - haps.add(new Haplotype(seq.getBytes(), false)); - } - - Collections.sort(haps, new HaplotypeBaseComparator()); - for ( int i = 0; i < lexStrings.size(); i++ ) - Assert.assertEquals(haps.get(i).getBaseString(), lexStrings.get(i), "Failed sort " + haps + " expected " + lexStrings); - } - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculatorUnitTest.java deleted file mode 100644 index aae88fbfb..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculatorUnitTest.java +++ /dev/null @@ -1,123 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.testng.Assert; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -public class HaplotypeLDCalculatorUnitTest extends BaseTest { - HaplotypeLDCalculator calculator; - - @BeforeMethod - public void setUp() throws Exception { - calculator = new HaplotypeLDCalculator(); - } - - /** - * Tests that we get the right values from the R^2 calculation - */ - @Test - public void computeProbOfBeingPhased() { - logger.warn("Executing testCalculateR2LD"); - - // See AA, AB, and BA in population - Assert.assertEquals(calculator.pPhasedTest(0, 0, 0, -100), 0, 0.00001); - - // See AA, AB, BB in population - Assert.assertTrue(calculator.pPhasedTest(0, 0, -100, 0) < 0.5); - - // See AA and BB in population - Assert.assertEquals(calculator.pPhasedTest(0, -100, -100, 0), 1, 0.00001); - - // See AA, AB, and BA but no BBs in population - Assert.assertEquals(calculator.pPhasedTest(0, -20, -40, Double.NEGATIVE_INFINITY), 0, 0.00001); - - // See BB, AB, and BA but no AAs in population, so BB is the best explanation - Assert.assertEquals(calculator.pPhasedTest(Double.NEGATIVE_INFINITY, -20, -40, 0), 1, 0.00001); - - // See only AB and BA but no AAs nor BBs in population - Assert.assertEquals(calculator.pPhasedTest(Double.NEGATIVE_INFINITY, -20, -40, Double.NEGATIVE_INFINITY), 0, 0.00001); - - // Previously bad input - Assert.assertEquals(calculator.pPhasedTest(-400, -600, -1200, Double.NEGATIVE_INFINITY), 0, 0.00001); - - // first variant is just bad, so BA and BB are both very bad, shouldn't be phased - Assert.assertEquals(calculator.pPhasedTest(0, -1000, -100, -10000), 0, 0.00001); - - // second variant is just bad, so AB and BB are both very bad, shouldn't be phased - Assert.assertEquals(calculator.pPhasedTest(0, -100, -1000, -10000), 0, 0.00001); - - // AA is very good, all all others are quite poor. Shouldn't be phased - Assert.assertEquals(calculator.pPhasedTest(0, -1000, -1000, -10000), 0, 0.00001); - - - for ( int i = -10; i > -10000; i -= 10 ) { - // only bad het states - Assert.assertTrue(calculator.pPhasedTest(0, i, i, 0) > 0.99, "Failed for " + i); - - // BB state is terrible - Assert.assertTrue(calculator.pPhasedTest(0, 0, 0, i) < 0.5, "Failed for " + i); - - // truth is AB, BA, and BB - Assert.assertTrue(calculator.pPhasedTest(i, 0, 0, 0) < 0.5, "Failed for " + i); - - // truth is AB, BA - Assert.assertTrue(calculator.pPhasedTest(i, 0, 0, i) < 0.5, "Failed for " + i); - - // Only good signal is AB, so we shouldn't be phased - Assert.assertTrue(calculator.pPhasedTest(i, i, 0, i) < 0.5, "Failed for " + i); - Assert.assertTrue(calculator.pPhasedTest(i, 0, i, i) < 0.5, "Failed for " + i); - } - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeScoreComparatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeScoreComparatorUnitTest.java deleted file mode 100644 index ea368e631..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeScoreComparatorUnitTest.java +++ /dev/null @@ -1,81 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.Utils; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -public class HaplotypeScoreComparatorUnitTest extends BaseTest { - @Test - public void testComparison() { - final List scores = Arrays.asList(3.0, 2.0, 1.0); - for ( final List myScores : Utils.makePermutations(scores, scores.size(), false) ) { - final List haps = new ArrayList(myScores.size()); - for ( final double score : myScores ) { - final Haplotype h = new Haplotype("ACT".getBytes(), false); - h.setScore(score); - haps.add(h); - } - - Collections.sort(haps, new HaplotypeScoreComparator()); - for ( int i = 0; i < myScores.size(); i++ ) - Assert.assertEquals(haps.get(i).getScore(), scores.get(i)); - } - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeSizeAndBaseComparatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeSizeAndBaseComparatorUnitTest.java deleted file mode 100644 index 1808ac19a..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeSizeAndBaseComparatorUnitTest.java +++ /dev/null @@ -1,87 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.Utils; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -/** - * User: btaylor - * Date: 8/1/13 - * Time: 11:09 AM - */ -public class HaplotypeSizeAndBaseComparatorUnitTest extends BaseTest { - @Test - public void testComparison() { - // desired ordering is by size first, subordered by lexacographic relationship between bases - final List rawStrings = Arrays.asList("A", "C", "AC", "CC", "CT", "AAT", "ACT", "GAT", "ACGT"); - final List lexStrings = new ArrayList<>(rawStrings); - - for ( final List seqs : Utils.makePermutations(lexStrings, lexStrings.size(), false) ) { - final List haps = new ArrayList<>(seqs.size()); - for ( final String seq : seqs ) { - haps.add(new Haplotype(seq.getBytes(), false)); - } - - Collections.sort(haps, new HaplotypeSizeAndBaseComparator()); - for ( int i = 0; i < lexStrings.size(); i++ ) - Assert.assertEquals(haps.get(i).getBaseString(), lexStrings.get(i), "Failed sort " + haps + " expected " + lexStrings); - } - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/LDMergerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/LDMergerUnitTest.java deleted file mode 100644 index 337a91e44..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/LDMergerUnitTest.java +++ /dev/null @@ -1,339 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import htsjdk.samtools.TextCigarCodec; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.*; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.TreeSet; - -public class LDMergerUnitTest extends BaseTest { - LDMerger merger; - GenomeLocParser genomeLocParser; - - @BeforeClass - public void init() throws FileNotFoundException { - genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(b37KGReference))); - } - - @BeforeMethod - public void setUp() throws Exception { - merger = new LDMerger(); - } - - @Test - public void testCreateMergedVariantContext() { - logger.warn("Executing testCreateMergedVariantContext"); - - final byte[] ref = "AATTCCGGAATTCCGGAATT".getBytes(); - final GenomeLoc refLoc = genomeLocParser.createGenomeLoc("2", 1700, 1700 + ref.length); - - // SNP + SNP = simple MNP - VariantContext thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); - VariantContext nextVC = new VariantContextBuilder().loc("2", 1704, 1704).alleles("C","G").make(); - VariantContext truthVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","GG").source("merged").make(); - VariantContext mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // SNP + ref + SNP = MNP with ref base gap - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCG").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // insertion + SNP - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TAAAAACG").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // SNP + insertion - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CAAAAA").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCCAAAAA").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // deletion + SNP - thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","T").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TG").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // SNP + deletion - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","GCC").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // insertion + deletion = MNP - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); - truthVC = new VariantContextBuilder().loc("2", 1704, 1706).alleles("CCG","ACC").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // insertion + deletion - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","TAAAAACC").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // insertion + insertion - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CA").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TACCA").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // deletion + deletion - thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); - truthVC = new VariantContextBuilder().loc("2", 1701, 1706).alleles("ATTCCG","ATCC").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // deletion + insertion (abutting) - thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make(); - nextVC = new VariantContextBuilder().loc("2", 1702, 1702).alleles("T","GCGCGC").make(); - truthVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","AGCGCGC").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // complex + complex - thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","AAA").make(); - nextVC = new VariantContextBuilder().loc("2", 1706, 1707).alleles("GG","AC").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1707).alleles("TCCGG","AAACAC").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - } - - @Test - public void testInsertionDeletionBecomingNullAllele() { - final byte[] ref = "CAAA".getBytes(); - final GenomeLoc refLoc = genomeLocParser.createGenomeLoc("2", 1700, 1700 + ref.length); - - // insertion + deletion results in a null allele, should return false - final VariantContext thisVC = new VariantContextBuilder().loc("2", 1700, 1701).alleles("CA","C").make(); - final VariantContext nextVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("A","AA").make(); - final VariantContext mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - Assert.assertNull(mergedVC, "Insertion deletion becoming a null allele should return a null variant context"); - } - - /** - * Just returns a given R2 value for testing - */ - private static class MockLDCalculator extends HaplotypeLDCalculator { - private final double R2; - - private MockLDCalculator(double r2) { - R2 = r2; - } - - @Override - protected double computeProbOfBeingPhased(VariantContext first, VariantContext second) { - return R2; - } - } - - @DataProvider(name = "R2MergerData") - public Object[][] makeR2MergerData() { - List tests = new ArrayList(); - - // this functionality can be adapted to provide input data for whatever you might want in your data - final double thres = LDMerger.MERGE_EVENTS_PROB_PHASED_THRESHOLD; - for ( final double r2 : Arrays.asList(0.0, thres - 0.01, thres + 0.01, 1.0) ) { - tests.add(new Object[]{"ACGT", "CCGC", 2, "4M", "ACGT", "CCGC", r2, r2 >= thres}); - tests.add(new Object[]{"ACGT", "AGGC", 2, "4M", "CGT", "GGC", r2, r2 >= thres}); - tests.add(new Object[]{"ACGT", "ACCC", 2, "4M", "GT", "CC", r2, r2 >= thres}); - tests.add(new Object[]{"ACGT", "ACCGTT", 2, "2M1I1M1I1M", "CG", "CCGT", r2, r2 >= thres}); - tests.add(new Object[]{"ACGT", "AGCT", 2, "4M", "CG", "GC", r2, r2 >= thres}); - tests.add(new Object[]{"ACAGT", "AAGC", 2, "1M1D3M", "ACAGT", "AAGC", r2, r2 >= thres}); - tests.add(new Object[]{"ACAGT", "AAT", 2, "1M1D1M1D1M", "ACAG", "AA", r2, r2 >= thres}); - - // cannot be merged -- only 1 event - tests.add(new Object[]{"AAA", "ACA", 1, "3M", null, null, r2, false}); - - final int dist = LDMerger.MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE + 2; - tests.add(new Object[]{Utils.dupString("A", dist), "C" + Utils.dupString("A", dist - 2) + "C", 2, dist + "M", null, null, r2, false}); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "R2MergerData") - public void testR2Merger(final String refS, final String hapS, int nEvents, final String cigar, final String expectedMergedRef, final String expectedMergedAlt, final double r2, final boolean expectMerge) { - final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.getSingleton().decode(refS.length() + "M")); - final Haplotype hap = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); - final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length()); - - final List haplotypes = Arrays.asList(ref, hap); - final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false); - final MockLDCalculator r2Calc = new MockLDCalculator(r2); - - Assert.assertEquals(vcStarts.size(), nEvents); - final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc); - Assert.assertEquals(merged, expectMerge); - Assert.assertEquals(vcStarts.size(), expectMerge ? 1 : nEvents); - if ( expectMerge ) { - final VariantContext vc = hap.getEventMap().getVariantContexts().iterator().next(); - Assert.assertTrue(vc.isBiallelic()); - Assert.assertEquals(vc.getReference().getDisplayString(), expectedMergedRef); - Assert.assertEquals(vc.getAlternateAllele(0).getDisplayString(), expectedMergedAlt); - } - } - - @Test - public void testR2MergerWithThirdHapWithoutEvent() { - final String refS = "ACGT"; - final String hapS = "CCGA"; - final String cigar = "4M"; - final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.getSingleton().decode(refS.length() + "M")); - final Haplotype hap1 = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); - final Haplotype hap2 = new Haplotype("ACGA".getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); - final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length()); - - final List haplotypes = Arrays.asList(ref, hap1, hap2); - final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false); - final MockLDCalculator r2Calc = new MockLDCalculator(1.0); - - Assert.assertEquals(vcStarts.size(), 2); - final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc); - Assert.assertEquals(merged, true); - Assert.assertEquals(vcStarts.size(), 1); - - final VariantContext vc = hap1.getEventMap().getVariantContexts().iterator().next(); - Assert.assertTrue(vc.isBiallelic()); - Assert.assertEquals(vc.getReference().getDisplayString(), "ACGT"); - Assert.assertEquals(vc.getAlternateAllele(0).getDisplayString(), "CCGA"); - - Assert.assertEquals(hap2.getEventMap().size(), 0); - } - - @Test - public void testR2MergerWithMultipleAllelesAtSites() { - final String refS = "ACGT"; - final String hapS = "TCGA"; - final String cigar = "4M"; - final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.getSingleton().decode(refS.length() + "M")); - final Haplotype hap1 = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); - - final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length()); - for (final String hap2S : Arrays.asList("GCGA", "TCGG")) { - final Haplotype hap2 = new Haplotype(hap2S.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); - - final List haplotypes = Arrays.asList(ref, hap1, hap2); - final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false); - final MockLDCalculator r2Calc = new MockLDCalculator(1.0); - - Assert.assertEquals(vcStarts.size(), 2); - final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc); - Assert.assertEquals(merged, false); - Assert.assertEquals(vcStarts.size(), 2); - } - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java index bfd588cab..e2f703e01 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java @@ -82,7 +82,7 @@ public class HaplotypeBAMWriterUnitTest extends BaseTest { private Haplotype makeHaplotype(final String bases, final String cigar) { final Haplotype hap = new Haplotype(bases.getBytes()); - hap.setCigar(TextCigarCodec.getSingleton().decode(cigar)); + hap.setCigar(TextCigarCodec.decode(cigar)); return hap; } @@ -160,7 +160,7 @@ public class HaplotypeBAMWriterUnitTest extends BaseTest { final String badCigar = "31M6D211M"; final String goodCigar = "28M6D214M"; final Haplotype badHap = new Haplotype(hap.getBytes()); - badHap.setCigar(TextCigarCodec.getSingleton().decode(hapCigar)); + badHap.setCigar(TextCigarCodec.decode(hapCigar)); badHap.setAlignmentStartHapwrtRef(hapStart); final int expectedPos = 10130740; @@ -177,10 +177,10 @@ public class HaplotypeBAMWriterUnitTest extends BaseTest { final GATKSAMRecord originalReadCopy = (GATKSAMRecord)read.clone(); if ( expectedReadCigar == null ) { - Assert.assertNull(AlignmentUtils.createReadAlignedToRef(read, haplotype, refStart, true)); + Assert.assertNull(AlignmentUtils.createReadAlignedToRef(read, haplotype, haplotype, refStart, true)); } else { - final Cigar expectedCigar = TextCigarCodec.getSingleton().decode(expectedReadCigar); - final GATKSAMRecord alignedRead = AlignmentUtils.createReadAlignedToRef(read, haplotype, refStart, true); + final Cigar expectedCigar = TextCigarCodec.decode(expectedReadCigar); + final GATKSAMRecord alignedRead = AlignmentUtils.createReadAlignedToRef(read, haplotype, haplotype, refStart, true); Assert.assertEquals(alignedRead.getReadName(), originalReadCopy.getReadName()); Assert.assertEquals(alignedRead.getAlignmentStart(), expectedReadStart); @@ -290,7 +290,7 @@ public class HaplotypeBAMWriterUnitTest extends BaseTest { @Test(dataProvider = "ComplexReadAlignedToRef", enabled = !DEBUG) public void testReadAlignedToRefComplexAlignment(final int testIndex, final GATKSAMRecord read, final String reference, final Haplotype haplotype, final int expectedMaxMismatches) throws Exception { final HaplotypeBAMWriter writer = new CalledHaplotypeBAMWriter(new MockDestination()); - final GATKSAMRecord alignedRead = AlignmentUtils.createReadAlignedToRef(read, haplotype, 1, true); + final GATKSAMRecord alignedRead = AlignmentUtils.createReadAlignedToRef(read, haplotype, new Haplotype(reference.getBytes(),true), 1, true); if ( alignedRead != null ) { final int mismatches = AlignmentUtils.getMismatchCount(alignedRead, reference.getBytes(), alignedRead.getAlignmentStart() - 1).numMismatches; Assert.assertTrue(mismatches <= expectedMaxMismatches, diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/NanoSchedulerIntegrationTest.java deleted file mode 100644 index f962260e9..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/NanoSchedulerIntegrationTest.java +++ /dev/null @@ -1,101 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.nanoScheduler; - -import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -// ********************************************************************************** // -// Note that this class also serves as an integration test for the VariantAnnotator! // -// ********************************************************************************** // - -public class NanoSchedulerIntegrationTest extends WalkerTest { - @DataProvider(name = "NanoSchedulerUGTest") - public Object[][] createNanoSchedulerUGTest() { - List tests = new ArrayList(); - - for ( final int nt : Arrays.asList(1, 2) ) - for ( final int nct : Arrays.asList(1, 2) ) { - tests.add(new Object[]{ "BOTH", "18418ddc2bdbe20c38ece6dd18535be7", nt, nct }); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = true, dataProvider = "NanoSchedulerUGTest") - private void testNanoSchedulerUGTest(final String glm, final String md5, final int nt, final int nct ) { - WalkerTestSpec spec = new WalkerTestSpec( - buildCommandLine( - "-T UnifiedGenotyper -R " + b37KGReference, - "--no_cmdline_in_header -G none", - "-I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", - "-L 20:10,000,000-10,100,000", - "-glm " + glm, - "--contamination_fraction_to_filter 0.0", - "-nt " + nt, - "-nct " + nct, - "-o %s" - ), - 1, - Arrays.asList(md5) - ); - executeTest(String.format("testUG-glm:%s-nt%d-nct%d", glm, nt, nct), spec); - } - - - -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/ActiveRegionTestDataSet.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/ActiveRegionTestDataSet.java deleted file mode 100644 index a41db9386..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/ActiveRegionTestDataSet.java +++ /dev/null @@ -1,593 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.pairhmm; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; -import org.apache.commons.math.distribution.ExponentialDistribution; -import org.broadinstitute.gatk.tools.walkers.haplotypecaller.AssemblyResult; -import org.broadinstitute.gatk.tools.walkers.haplotypecaller.AssemblyResultSet; -import org.broadinstitute.gatk.tools.walkers.haplotypecaller.Civar; -import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingGraph; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** -* Mock-up active region data used in testing. -* -* @author Valentin Ruano-Rubio <valentin@broadinstitute.org> -*/ -public class ActiveRegionTestDataSet { - - private final byte[] referenceBytes; - protected String reference; - protected String[] haplotypeCigars; - protected List haplotypeStrings; - protected String[] readCigars; - protected byte[] bq; - protected byte[] dq; - protected byte[] iq; - protected int kmerSize; - private List haplotypeList; - private List readList; - private AssemblyResultSet assemblyResultSet; - private Map readBySequence; - private String stringRepresentation; - private List> readEventOffsetList; - private GenomeLocParser genomeLocParser; - - /** Create a new active region data test set */ - public ActiveRegionTestDataSet(final int kmerSize, final String reference, final String[] haplotypes, - final String[] readCigars, final byte[] bq, final byte[] dq, final byte[] iq) { - this.reference = reference; - this.referenceBytes = reference.getBytes(); - this.haplotypeCigars = haplotypes; - this.readCigars = readCigars; - this.bq = bq; - this.dq = dq; - this.iq = iq; - this.kmerSize = kmerSize; - this.genomeLocParser = new GenomeLocParser(ArtificialSAMUtils.createArtificialSamHeader(1,1,reference.length()).getSequenceDictionary()); - } - - public String getReference() { - return reference; - } - - public String toString() { - if (stringRepresentation == null) - return super.toString(); - else return stringRepresentation; - } - - public AssemblyResultSet assemblyResultSet() { - if (assemblyResultSet == null) { - final ReadThreadingGraph rtg = new ReadThreadingGraph(kmerSize); - rtg.addSequence("anonymous", this.getReference().getBytes(), true); - for (final String haplotype : this.haplotypesStrings()) { - rtg.addSequence("anonymous", haplotype.getBytes(), false); - } - rtg.buildGraphIfNecessary(); - if (rtg.hasCycles()) - throw new RuntimeException("there is cycles in the reference with kmer size " + kmerSize + ". Don't use this size for the benchmark or change the reference"); - - List haplotypeList = this.haplotypeList(); - - assemblyResultSet = new AssemblyResultSet(); - final AssemblyResult ar = new AssemblyResult((haplotypeList.size() > 1 ? - AssemblyResult.Status.ASSEMBLED_SOME_VARIATION : AssemblyResult.Status.JUST_ASSEMBLED_REFERENCE),rtg.convertToSequenceGraph()); - ar.setThreadingGraph(rtg); - - for (final Haplotype h : haplotypeList) - assemblyResultSet.add(h, ar); - } - return assemblyResultSet; - } - - public List haplotypesStrings() { - if (haplotypeStrings != null) { - return haplotypeStrings; - } - final List result = new ArrayList<>(haplotypeCigars.length); - String reference = this.reference; - for (final String cigar : haplotypeCigars) { - if (cigar.matches("^Civar:.*$")) { - stringRepresentation = cigar.substring(6); - result.addAll(expandAllCombinations(cigar.substring(6),reference)); - } else if (cigar.matches("^.*\\d+.*$")) { - result.add(applyCigar(reference, cigar,0,true)); - } else { - result.add(cigar); - } - } - haplotypeStrings = result; - return result; - } - - private List expandAllCombinations(final String cigarString, final String reference) { - final Civar civar = Civar.fromCharSequence(cigarString); - final List unrolledCivars = civar.optionalizeAll().unroll(); - List result = new ArrayList<>(unrolledCivars.size()); - for (final Civar c : unrolledCivars) { - result.add(c.applyTo(reference)); - } - return result; - } - - private List expandAllHaplotypeCombinations(final String civarString, final String reference) { - final Civar civar = Civar.fromCharSequence(civarString); - final List unrolledCivars = civar.optionalizeAll().unroll(); - List result = new ArrayList<>(unrolledCivars.size()); - for (final Civar c : unrolledCivars) { - final String baseString = c.applyTo(reference); - final Haplotype haplotype = new Haplotype(baseString.getBytes(),baseString.equals(reference)); - haplotype.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); - try { - haplotype.setCigar(c.toCigar(reference.length())); - } catch (final RuntimeException ex) { - c.applyTo(reference); - c.toCigar(reference.length()); - throw new RuntimeException("" + c + " " + ex.getMessage(),ex); - } - result.add(haplotype); - } - return result; - } - - - public List haplotypeList() { - if (haplotypeList == null) { - - final List result = new ArrayList<>(haplotypeCigars.length); - final String reference = this.reference; - for (final String cigar : haplotypeCigars) { - if (cigar.matches("^Civar:.*$")) { - stringRepresentation = cigar.substring(6); - result.addAll(expandAllHaplotypeCombinations(cigar.substring(6), reference)); - } else if (cigar.matches("^.*\\d+.*$")) { - result.add(cigarToHaplotype(reference, cigar, 0, true)); - } else { - final Haplotype h = new Haplotype(cigar.getBytes()); - h.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); - result.add(h); - } - } - haplotypeList = result; - } - return haplotypeList; - } - - - protected SAMSequenceDictionary artificialSAMSequenceDictionary() { - return new SAMSequenceDictionary(Collections.singletonList(new SAMSequenceRecord("00",reference.length()))); - } - - protected SAMFileHeader artificialSAMFileHeader() { - return ArtificialSAMUtils.createArtificialSamHeader(artificialSAMSequenceDictionary()); - } - - public List readList() { - if (readList == null) { - final SAMFileHeader header = artificialSAMFileHeader(); - readList = new ArrayList<>(readCigars.length); - final List haplotypes = haplotypesStrings(); - int count = 0; - for (final String descr : readCigars) { - String sequence; - if (descr.matches("^\\d+:\\d+:.+$")) { - final String[] parts = descr.split(":"); - int allele = Integer.valueOf(parts[0]); - int offset = Integer.valueOf(parts[1]); - final String cigar = parts[2]; - final String base = allele == 0 ? reference : haplotypes.get(allele - 1); - sequence = applyCigar(base, cigar, offset, false); - final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header, "read_" + count, 0, 1, sequence.getBytes(), Arrays.copyOf(bq, sequence.length())); - readList.add(new MyGATKSAMRecord(samRecord)); - } else if (descr.matches("^\\*:\\d+:\\d+$")) { - int readCount = Integer.valueOf(descr.split(":")[1]); - int readLength = Integer.valueOf(descr.split(":")[2]); - readList.addAll(generateSamRecords(haplotypes, readCount, readLength, header, count)); - } else { - sequence = descr; - final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header, "read_" + count, 0, 1, sequence.getBytes(), Arrays.copyOf(bq, sequence.length())); - readList.add(new MyGATKSAMRecord(samRecord)); - } - count = readList.size(); - } - } - return readList; - } - - public List> readEventOffsetList() { - if (haplotypeCigars.length != 1 || !haplotypeCigars[0].startsWith("Civar:")) - throw new UnsupportedOperationException(); - if (readEventOffsetList == null) { - final Civar civar = Civar.fromCharSequence(haplotypeCigars[0].substring(6)); - final List unrolledCivars = civar.optionalizeAll().unroll(); - - readEventOffsetList = new ArrayList<>(readCigars.length); - int count = 0; - for (final String descr : readCigars) { - if (descr.matches("^\\d+:\\d+:.+$")) { - throw new UnsupportedOperationException(); - } else if (descr.matches("^\\*:\\d+:\\d+$")) { - int readCount = Integer.valueOf(descr.split(":")[1]); - int readLength = Integer.valueOf(descr.split(":")[2]); - readEventOffsetList.addAll(generateElementOffsetRecords(haplotypesStrings(), unrolledCivars, readCount, readLength, count)); - } else { - throw new UnsupportedOperationException(); - } - count = readEventOffsetList.size(); - } - readEventOffsetList = Collections.unmodifiableList(readEventOffsetList); - } - return readEventOffsetList; - } - - - - - @SuppressWarnings("unused") - public String cigarToSequence(final String cigar) { - String reference = this.reference; - return applyCigar(reference, cigar,0,true); - } - - @SuppressWarnings("unused") - public GATKSAMRecord readFromString(final String readSequence) { - if (readBySequence == null) { - final List readList = readList(); - readBySequence = new HashMap<>(readList.size()); - for (final GATKSAMRecord r : readList) - readBySequence.put(r.getReadString(),r); - } - return readBySequence.get(readSequence); - } - - public List unrolledCivars() { - if (haplotypeCigars.length != 1 || !haplotypeCigars[0].startsWith("Civar:")) - throw new UnsupportedOperationException(); - final Civar civar = Civar.fromCharSequence(haplotypeCigars[0].substring(6)); - return civar.optionalizeAll().unroll(); - } - - public void introduceErrors(final Random rnd) { - final List reads = readList(); - final ArrayList result = new ArrayList<>(reads.size()); - for (final GATKSAMRecord read : reads) { - result.add(new MyGATKSAMRecord(read,rnd)); - } - readList = result; - } - - private class MyGATKSAMRecord extends GATKSAMRecord { - protected MyGATKSAMRecord(final GATKSAMRecord r) { - super(r.getHeader(), r.getReferenceIndex(), r.getAlignmentStart(), (short) r.getReadNameLength(), - (short) 100, -1, r.getCigarLength(), r.getFlags(), r.getReadLength(), - r.getMateReferenceIndex(), r.getMateAlignmentStart(), r.getInferredInsertSize(), - new byte[0]); - this.setReadBases(r.getReadBases()); - this.setBaseQualities(r.getBaseQualities()); - this.setReadName(r.getReadName()); - } - - ExponentialDistribution indelLengthDist = MathUtils.exponentialDistribution(1.0 / 0.9); - - public MyGATKSAMRecord(final GATKSAMRecord r, final Random rnd) { - super(r.getHeader(), r.getReferenceIndex(), r.getAlignmentStart(), (short) r.getReadNameLength(), - (short) 100, -1, r.getCigarLength(), r.getFlags(), r.getReadLength(), - r.getMateReferenceIndex(), r.getMateAlignmentStart(), r.getInferredInsertSize(), - new byte[0]); - final byte[] bases = new byte[r.getReadBases().length]; - - final byte[] readBases = r.getReadBases(); - final byte[] bq = r.getBaseQualities(); - final byte[] iq = r.getBaseInsertionQualities(); - final byte[] dq = r.getBaseDeletionQualities(); - int refOffset = r.getAlignmentStart() - 1; - int readOffset = 0; - for (int i = 0; i < r.getReadBases().length;) { - double p = rnd.nextDouble(); - double iqp = QualityUtils.qualToErrorProb(iq[i]); - if (p < iqp) { // insertion - final int length = Math.min(generateIndelLength(rnd),r.getReadBases().length - i); - final int refStart = rnd.nextInt(reference.length() - length); - System.arraycopy(referenceBytes,refStart,bases,i,length); - i += length; - continue; - } - p -= iqp; - double dqp = QualityUtils.qualToErrorProb(dq[i]); - if (p < dqp) { - final int length = generateIndelLength(rnd); - refOffset += length; - refOffset = refOffset % referenceBytes.length; - readOffset += length; - continue; - } - p -= dqp; - double bqp = QualityUtils.qualToErrorProb(bq[i]); - byte b = readOffset < readBases.length ? readBases[readOffset] : referenceBytes[refOffset]; - byte nb; - if (p < bqp) { - switch (b) { - case 'A': nb = 'C'; break; - case 'T': nb = 'A'; break; - case 'C': nb = 'G'; break; - case 'G': nb = 'B'; break; - default: nb = 'A'; - } - } else - nb = b; - - bases[i++] = nb; - refOffset++; - refOffset = refOffset % referenceBytes.length; - readOffset++; - } - this.setReadBases(bases); - this.setBaseQualities(r.getBaseQualities()); - this.setReadName(r.getReadName()); - - - } - - private int generateIndelLength(final Random rnd) { - final int length; - try { - length = (int) Math.round(indelLengthDist.inverseCumulativeProbability(rnd.nextDouble()) + 1); - } catch (Exception e) { - throw new RuntimeException(e); - } - return length; - } - - @Override - public byte[] getBaseDeletionQualities() { - return Arrays.copyOf(dq,getReadLength()); - } - - @Override - public byte[] getBaseInsertionQualities() { - return Arrays.copyOf(iq,getReadLength()); - } - - @Override - public int getMappingQuality() { - return 100; - } - - @Override - public int hashCode() { - return getReadName().hashCode(); - } - - @Override - public boolean equals(Object o) { - if (o instanceof GATKSAMRecord) { - return getReadName().equals(((GATKSAMRecord)o).getReadName()); - } else { - return false; - } - } - - public String toString() { - return super.toString() + " " + this.getReadString(); - } - } - - - public List readStrings() { - final List result = new ArrayList<>(readCigars.length); - final List haplotypes = haplotypesStrings(); - for (final String descr : readCigars) { - String sequence; - if (descr.matches("^\\d+:\\d+:.+$")) { - final String[] parts = descr.split(":"); - int allele = Integer.valueOf(parts[0]); - int offset = Integer.valueOf(parts[1]); - final String cigar = parts[2]; - final String base = allele == 0 ? reference : haplotypes.get(allele - 1); - sequence = applyCigar(base, cigar, offset, false); - result.add(sequence); - } else if (descr.matches("\\*:^\\d+:\\d+")) { - int readCount = Integer.valueOf(descr.split(":")[1]); - int readLength = Integer.valueOf(descr.split(":")[2]); - result.addAll(generateReads(haplotypes, readCount, readLength)); - } else { - sequence = descr; - result.add(sequence); - } - } - return result; - } - - private List generateReads(final List haplotypes, final int readCount, final int readLength) { - final List result = new ArrayList<>(readCount); - for (int i = 0; i < readCount; i++) { - int hi = i % haplotypes.size(); - final String h = haplotypes.get(hi); - int offset = i % h.length() - readLength; - result.add(h.substring(offset,offset + readLength)); - } - return result; - } - - private List generateSamRecords(final List haplotypes, final int readCount, final int readLength, final SAMFileHeader header, final int idStart) { - int id = idStart; - final List result = new ArrayList<>(readCount); - for (int i = 0; i < readCount; i++) { - int hi = i % haplotypes.size(); - final String h = haplotypes.get(hi); - int offset = h.length() <= readLength ? 0 : i % (h.length() - readLength); - int to = Math.min(h.length(),offset + readLength); - byte[] bases = h.substring(offset,to).getBytes(); - byte[] quals = Arrays.copyOf(bq,to - offset); - final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header,"read_" + id++,0,offset + 1,bases, quals); - result.add(new MyGATKSAMRecord(samRecord)); - } - return result; - } - - - private List> generateElementOffsetRecords(final List haplotypes, final List unrolledCivars, final int readCount, final int readLength, final int count) { - - final List> result = new ArrayList<>(readCount); - for (int i = 0; i < readCount; i++) { - int hi = i % unrolledCivars.size(); - final Civar c = unrolledCivars.get(hi); - final String h = haplotypes.get(hi); - int offset = h.length() <= readLength ? 0 : i % (h.length() - readLength); - int to = Math.min(h.length(),offset + readLength); - result.add(c.eventOffsets(reference,offset,to)); - } - return result; - } - - private static final Pattern cigarPattern = Pattern.compile("(\\d+)([=A-Z])"); - - - private Haplotype cigarToHaplotype(final String reference, final String cigar, final int offset, final boolean global) { - final String sequence = applyCigar(reference,cigar,offset,global); - final Haplotype haplotype = new Haplotype(sequence.getBytes(),reference.equals(sequence)); - haplotype.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); - haplotype.setCigar(Civar.fromCharSequence(cigar).toCigar(reference.length())); - return haplotype; - } - - private String applyCigar(final String reference, final String cigar, final int offset, final boolean global) { - final Matcher pm = cigarPattern.matcher(cigar); - StringBuffer sb = new StringBuffer(); - int index = offset; - while (pm.find()) { - int length = Integer.valueOf(pm.group(1)); - char operator = pm.group(2).charAt(0); - switch (operator) { - case '=' : - try { - sb.append(reference.substring(index, index + length)); - } catch (Exception e) { - throw new RuntimeException(" " + index + " " + (index + length) + " " + reference.length() + " " + cigar,e); - } - index += length; break; - case 'D' : - index += length; break; - case 'I' : - String insert = cigar.substring(pm.end(),pm.end() + length).toUpperCase(); - sb.append(insert); break; - case 'V' : - sb.append(transversionV(reference.charAt(index))); index++; break; - case 'W' : - sb.append(transversionW(reference.charAt(index))); index++; break; - case 'T' : - sb.append(transition(reference.charAt(index))); index++; break; - default: - throw new UnsupportedOperationException("cigar operator " + operator + " not supported."); - } - } - if (global && index != reference.length()) { - throw new RuntimeException(" haplotype cigar does not explain reference length (" + index + " != " + reference.length() + ") on cigar " + cigar); - } else if (index > reference.length()) { - throw new RuntimeException(" index beyond end "); - } - return sb.toString(); - } - - protected int kmerSize() { - return kmerSize; - } - - private char transversionV(final char c) { - switch (Character.toUpperCase(c)) { - case 'A': return 'C'; - case 'G': return 'T'; - case 'C': return 'A'; - case 'T': return 'G'; - default: - return c; - } - - } - - private char transversionW(final char c) { - switch (Character.toUpperCase(c)) { - case 'A': return 'T'; - case 'G': return 'C'; - case 'T': return 'A'; - case 'C': return 'G'; - default: - return c; - } - - } - - private char transition(final char c) { - switch (Character.toUpperCase(c)) { - case 'A': return 'G'; - case 'G': return 'A'; - case 'T': return 'C'; - case 'C': return 'T'; - default: - return c; - } - - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/FastLoglessPairHMMUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/FastLoglessPairHMMUnitTest.java deleted file mode 100644 index 621ef7b1f..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/FastLoglessPairHMMUnitTest.java +++ /dev/null @@ -1,182 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.pairhmm; - -import org.broadinstitute.gatk.tools.walkers.haplotypecaller.ActiveRegionTestDataSetUnitTest; -import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.*; - - -/** - * Created with IntelliJ IDEA. - * User: valentin - * Date: 10/13/13 - * Time: 12:55 PM - * To change this template use File | Settings | File Templates. - */ -public class FastLoglessPairHMMUnitTest extends ActiveRegionTestDataSetUnitTest { - - private FastLoglessPairHMM unsorted = new FastLoglessPairHMM((byte)10); - private FastLoglessPairHMM sorted = new FastLoglessPairHMM((byte)10); - - @Test(enabled=false,dataProvider="activeRegionTestDataSets") - public void testActiveRegionsDataSet(final ActiveRegionTestDataSet as, final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) { - - } - - @Test(enabled=true,dataProvider="activeRegionTestDataSets") - public void testHaplotypeGrouped(final ActiveRegionTestDataSet as, final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) { - final List reads = as.readList(); - final List haplotypes = as.haplotypeList(); - PairHMMReadyHaplotypes haplotypeCollection = new PairHMMReadyHaplotypes(haplotypes.size()); - final List sortedHaplotypes = new ArrayList<>(haplotypes); - Collections.sort(sortedHaplotypes, HAPLOTYPE_COMPARATOR); - Map basesToPos = new HashMap<>(sortedHaplotypes.size()); - int nextIdx = 0; - - for (final Haplotype h : sortedHaplotypes) { - final byte[] bases = h.getBases(); - haplotypeCollection.add(bases); - basesToPos.put(bases,nextIdx++); - } - for (GATKSAMRecord read : reads) { - final double[] unsortedLikelihoods = new double[sortedHaplotypes.size()]; - final double[] sortedLikelihoods = new double[sortedHaplotypes.size()]; - unsorted.loadRead(read); - sorted.loadRead(read); - final Map unsortedResults = new HashMap<>(haplotypes.size()); - for (int i = 0; i < sortedHaplotypes.size(); i++) { - final Haplotype h = sortedHaplotypes.get(i); - final byte[] haplotypeBases = h.getBases().clone(); - unsorted.loadHaplotypeBases(haplotypeBases); - double lk = unsorted.calculateLocalLikelihood(0, read.getReadLength(), 0, haplotypeBases.length, false); - unsortedLikelihoods[i] = lk; - } - sorted.calculateLocalLikelihoods(0, read.getReadLength(), haplotypeCollection); - for (final PairHMMReadyHaplotypes.Entry entry : haplotypeCollection) { - final byte[] bases = entry.getBases(); - final double lk = entry.getLikelihood(); - final int haplotypePos = basesToPos.get(bases); - sortedLikelihoods[haplotypePos] = lk; - } - for (int i = 0; i < unsortedLikelihoods.length; i++) - Assert.assertEquals(unsortedLikelihoods[i],sortedLikelihoods[i],0.00000001,Arrays.toString(unsortedLikelihoods) + Arrays.toString(sortedLikelihoods)); - } - } - - @Test(enabled=true,dataProvider="activeRegionTestDataSets") - public void testSortedVsUnsorted(final ActiveRegionTestDataSet as, final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) { - final List reads = as.readList(); - final List haplotypes = as.haplotypeList(); - final List sortedHaplotypes = new ArrayList<>(haplotypes); - Collections.sort(sortedHaplotypes, HAPLOTYPE_COMPARATOR); - - byte[] lastHaplotypeBases = null; - for (GATKSAMRecord read : reads) { - final double[] unsortedLikelihoods = new double[sortedHaplotypes.size()]; - final double[] sortedLikelihoods = new double[sortedHaplotypes.size()]; - unsorted.loadRead(read); - sorted.loadRead(read); - for (int i = 0; i < sortedHaplotypes.size(); i++) { - final Haplotype h = sortedHaplotypes.get(i); - final byte[] haplotypeBases = h.getBases().clone(); - final byte[] haplotypeBases2 = haplotypeBases.clone(); - int commonPrefixEnd = 0; - - - if (lastHaplotypeBases != null) { - final int prefixEndLimit = Math.min(lastHaplotypeBases.length,haplotypeBases.length); - for (commonPrefixEnd = 0; commonPrefixEnd < prefixEndLimit; commonPrefixEnd++) - if (lastHaplotypeBases[commonPrefixEnd] != haplotypeBases[commonPrefixEnd]) - break; - } - - unsorted.loadHaplotypeBases(haplotypeBases); - sorted.changeHaplotypeSuffix(commonPrefixEnd, haplotypeBases, commonPrefixEnd, haplotypeBases.length); - Assert.assertTrue(Arrays.equals(haplotypeBases2, unsorted.getHaplotypeBases())); - Assert.assertTrue(Arrays.equals(haplotypeBases2, sorted.getHaplotypeBases())); - unsortedLikelihoods[i] = unsorted.calculateLocalLikelihood(0, read.getReadLength(), 0, haplotypeBases.length, false); - sortedLikelihoods[i] = sorted.calculateLocalLikelihood(0, read.getReadLength(), 0, haplotypeBases.length, false); - Assert.assertTrue(Arrays.equals(haplotypeBases2,unsorted.getHaplotypeBases())); - Assert.assertTrue(Arrays.equals(haplotypeBases2,sorted.getHaplotypeBases())); - Assert.assertEquals((double)unsortedLikelihoods[i], (double) sortedLikelihoods[i],0.00000001); - lastHaplotypeBases = haplotypeBases; - } - } - } - - public static final Comparator HAPLOTYPE_COMPARATOR = new Comparator() { - - @Override - public int compare(final Haplotype o1, final Haplotype o2) { - if (o1 == o2) - return 0; - final byte[] bases1 = o1.getBases(); - final byte[] bases2 = o2.getBases(); - final int ilimit = Math.min(bases1.length,bases2.length); - for (int i = 0; i < ilimit; i++) { - final int cmp = Byte.compare(bases1[i],bases2[i]); - if (cmp != 0) return cmp; - } - if (bases1.length == bases2.length) return 0; - return (bases1.length > bases2.length) ? -1 : 1; // is a bit better to get the longest haplotypes first. - } - }; - - - - -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMProbabilityBugIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMProbabilityBugIntegrationTest.java deleted file mode 100644 index b7d0b037e..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMProbabilityBugIntegrationTest.java +++ /dev/null @@ -1,92 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.pairhmm; - -import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; - -/** - * Test for the Prob > 1 bug in PairHMM using callers. - * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class PairHMMProbabilityBugIntegrationTest extends WalkerTest { - - private static final File REFERENCE = new File("/humgen/gsa-hpprojects/GATK/bundle/current/hg19/ucsc.hg19.fasta").getAbsoluteFile(); - private static final File BAM = new File (privateTestDir, "pairhmm_prob_bug.bam").getAbsoluteFile(); - private static final File INTERVAL = new File (privateTestDir, "pairhmm_prob_bug.interval.bed").getAbsoluteFile(); - - private static final File UG_BAM = new File(privateTestDir, "pairhmm_prob_bug.ug.bam").getAbsoluteFile(); - private static final File UG_INTERVAL = new File(privateTestDir, "pairhmm_prob_bug.ug.intervals.bed").getAbsoluteFile(); - - - @Test - public void testHaplotypeCaller() { - final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s", - REFERENCE,BAM,INTERVAL); - final String name = getClass().getSimpleName() + ".testHaplotypeCaller"; - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); - executeTest(name, spec); - } - - @Test - public void testUnifiedGenotyper() { - final String commandLine = String.format("-T UnifiedGenotyper -R %s -I %s -L %s -dcov 200 -glm INDEL", - REFERENCE,UG_BAM,UG_INTERVAL); - final String name = getClass().getSimpleName() + ".testUnifiedGenotyper"; - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); - executeTest(name, spec); - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMUnitTest.java index 5643e3030..1c34405b0 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMUnitTest.java @@ -55,7 +55,6 @@ package org.broadinstitute.gatk.utils.pairhmm; // the imports for unit testing. import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.QualityUtils; @@ -252,8 +251,8 @@ public class PairHMMUnitTest extends BaseTest { @DataProvider(name = "OptimizedLikelihoodTestProvider") public Object[][] makeOptimizedLikelihoodTests() { - GenomeAnalysisEngine.resetRandomGenerator(); - final Random random = GenomeAnalysisEngine.getRandomGenerator(); + Utils.resetRandomGenerator(); + final Random random = Utils.getRandomGenerator(); final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 30, 40, 60) : Arrays.asList(30); final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 40, 60) : Arrays.asList(40); final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ContextCovariateUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ContextCovariateUnitTest.java deleted file mode 100644 index 188902bb5..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ContextCovariateUnitTest.java +++ /dev/null @@ -1,122 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.recalibration.covariates.ContextCovariate; -import org.broadinstitute.gatk.utils.recalibration.covariates.Covariate; -import org.broadinstitute.gatk.utils.clipping.ClippingRepresentation; -import org.broadinstitute.gatk.utils.clipping.ReadClipper; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -/** - * @author Mauricio Carneiro - * @since 3/1/12 - */ -public class ContextCovariateUnitTest { - ContextCovariate covariate; - RecalibrationArgumentCollection RAC; - - @BeforeClass - public void init() { - RAC = new RecalibrationArgumentCollection(); - covariate = new ContextCovariate(); - covariate.initialize(RAC); - } - - @BeforeMethod - public void initCache() { - ReadCovariates.clearKeysCache(); - } - - @Test(enabled = true) - public void testSimpleContexts() { - GATKSAMRecord read = ReadUtils.createRandomRead(1000); - GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, readCovariates); - - verifyCovariateArray(readCovariates.getMismatchesKeySet(), RAC.MISMATCHES_CONTEXT_SIZE, clippedRead, covariate); - verifyCovariateArray(readCovariates.getInsertionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); - verifyCovariateArray(readCovariates.getDeletionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); - } - - public static void verifyCovariateArray(int[][] values, int contextSize, GATKSAMRecord read, Covariate contextCovariate) { - for (int i = 0; i < values.length; i++) - Assert.assertEquals(contextCovariate.formatKey(values[i][0]), expectedContext(read, i, contextSize)); - - } - - public static String expectedContext (GATKSAMRecord read, int offset, int contextSize) { - final String bases = stringFrom(read.getReadBases()); - String expectedContext = null; - if (offset - contextSize + 1 >= 0) { - String context = bases.substring(offset - contextSize + 1, offset + 1); - if (!context.contains("N")) - expectedContext = context; - } - return expectedContext; - } - - private static String stringFrom(byte[] array) { - String s = ""; - for (byte value : array) - s += (char) value; - return s; - } - -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/CycleCovariateUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/CycleCovariateUnitTest.java deleted file mode 100644 index 316c28374..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/CycleCovariateUnitTest.java +++ /dev/null @@ -1,141 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.recalibration.covariates.CycleCovariate; -import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -/** - * @author Mauricio Carneiro - * @since 3/1/12 - */ -public class CycleCovariateUnitTest { - CycleCovariate covariate; - RecalibrationArgumentCollection RAC; - - @BeforeClass - public void init() { - RAC = new RecalibrationArgumentCollection(); - covariate = new CycleCovariate(); - covariate.initialize(RAC); - } - - @BeforeMethod - public void initCache() { - ReadCovariates.clearKeysCache(); - } - - @Test(enabled = true) - public void testSimpleCycles() { - short readLength = 10; - GATKSAMRecord read = ReadUtils.createRandomRead(readLength); - read.setReadPairedFlag(true); - read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); - read.getReadGroup().setPlatform("illumina"); - - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), 1, (short) 1); - - read.setReadNegativeStrandFlag(true); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), readLength, -1); - - read.setSecondOfPairFlag(true); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), -readLength, 1); - - read.setReadNegativeStrandFlag(false); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), -1, -1); - } - - private void verifyCovariateArray(int[][] values, int init, int increment) { - for (short i = 0; i < values.length; i++) { - short actual = Short.decode(covariate.formatKey(values[i][0])); - int expected = init + (increment * i); - Assert.assertEquals(actual, expected); - } - } - - @Test(enabled = true, expectedExceptions={UserException.class}) - public void testMoreThanMaxCycleFails() { - int readLength = RAC.MAXIMUM_CYCLE_VALUE + 1; - GATKSAMRecord read = ReadUtils.createRandomRead(readLength); - read.setReadPairedFlag(true); - read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); - read.getReadGroup().setPlatform("illumina"); - - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, readCovariates); - } - - @Test(enabled = true) - public void testMaxCyclePasses() { - int readLength = RAC.MAXIMUM_CYCLE_VALUE; - GATKSAMRecord read = ReadUtils.createRandomRead(readLength); - read.setReadPairedFlag(true); - read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); - read.getReadGroup().setPlatform("illumina"); - - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, readCovariates); - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/QualQuantizerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/QualQuantizerUnitTest.java deleted file mode 100644 index 90bad890e..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/QualQuantizerUnitTest.java +++ /dev/null @@ -1,195 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - - -// the imports for unit testing. - - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.Utils; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - - -public class QualQuantizerUnitTest extends BaseTest { - @BeforeSuite - public void before() { - - } - - // -------------------------------------------------------------------------------- - // - // merge case Provider - // - // -------------------------------------------------------------------------------- - - private class QualIntervalTestProvider extends TestDataProvider { - final QualQuantizer.QualInterval left, right; - int exError, exTotal, exQual; - double exErrorRate; - - private QualIntervalTestProvider(int leftE, int leftN, int rightE, int rightN, int exError, int exTotal) { - super(QualIntervalTestProvider.class); - - QualQuantizer qq = new QualQuantizer(0); - left = qq.new QualInterval(10, 10, leftN, leftE, 0); - right = qq.new QualInterval(11, 11, rightN, rightE, 0); - - this.exError = exError; - this.exTotal = exTotal; - this.exErrorRate = (leftE + rightE + 1) / (1.0 * (leftN + rightN + 1)); - this.exQual = QualityUtils.errorProbToQual(this.exErrorRate); - } - } - - @DataProvider(name = "QualIntervalTestProvider") - public Object[][] makeQualIntervalTestProvider() { - new QualIntervalTestProvider(10, 100, 10, 1000, 20, 1100); - new QualIntervalTestProvider(0, 100, 10, 900, 10, 1000); - new QualIntervalTestProvider(10, 900, 0, 100, 10, 1000); - new QualIntervalTestProvider(0, 0, 10, 100, 10, 100); - new QualIntervalTestProvider(1, 10, 9, 90, 10, 100); - new QualIntervalTestProvider(1, 10, 9, 100000, 10, 100010); - new QualIntervalTestProvider(1, 10, 9, 1000000, 10,1000010); - - return QualIntervalTestProvider.getTests(QualIntervalTestProvider.class); - } - - @Test(dataProvider = "QualIntervalTestProvider") - public void testQualInterval(QualIntervalTestProvider cfg) { - QualQuantizer.QualInterval merged = cfg.left.merge(cfg.right); - Assert.assertEquals(merged.nErrors, cfg.exError); - Assert.assertEquals(merged.nObservations, cfg.exTotal); - Assert.assertEquals(merged.getErrorRate(), cfg.exErrorRate); - Assert.assertEquals(merged.getQual(), cfg.exQual); - } - - @Test - public void testMinInterestingQual() { - for ( int q = 0; q < 15; q++ ) { - for ( int minQual = 0; minQual <= 10; minQual ++ ) { - QualQuantizer qq = new QualQuantizer(minQual); - QualQuantizer.QualInterval left = qq.new QualInterval(q, q, 100, 10, 0); - QualQuantizer.QualInterval right = qq.new QualInterval(q+1, q+1, 1000, 100, 0); - - QualQuantizer.QualInterval merged = left.merge(right); - boolean shouldBeFree = q+1 <= minQual; - if ( shouldBeFree ) - Assert.assertEquals(merged.getPenalty(), 0.0); - else - Assert.assertTrue(merged.getPenalty() > 0.0); - } - } - } - - - // -------------------------------------------------------------------------------- - // - // High-level case Provider - // - // -------------------------------------------------------------------------------- - - private class QuantizerTestProvider extends TestDataProvider { - final List nObservationsPerQual = new ArrayList(); - final int nLevels; - final List expectedMap; - - private QuantizerTestProvider(final List nObservationsPerQual, final int nLevels, final List expectedMap) { - super(QuantizerTestProvider.class); - - for ( int x : nObservationsPerQual ) - this.nObservationsPerQual.add((long)x); - this.nLevels = nLevels; - this.expectedMap = expectedMap; - } - - @Override - public String toString() { - return String.format("QQTest nLevels=%d nObs=[%s] map=[%s]", - nLevels, Utils.join(",", nObservationsPerQual), Utils.join(",", expectedMap)); - } - } - - @DataProvider(name = "QuantizerTestProvider") - public Object[][] makeQuantizerTestProvider() { - List allQ2 = Arrays.asList(0, 0, 1000, 0, 0); - - new QuantizerTestProvider(allQ2, 5, Arrays.asList(0, 1, 2, 3, 4)); - new QuantizerTestProvider(allQ2, 1, Arrays.asList(2, 2, 2, 2, 2)); - - new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 0, 1000), 2, Arrays.asList(2, 2, 2, 2, 4)); - new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 1, 1000), 2, Arrays.asList(2, 2, 2, 4, 4)); - new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 10, 1000), 2, Arrays.asList(2, 2, 2, 2, 4)); - - return QuantizerTestProvider.getTests(QuantizerTestProvider.class); - } - - @Test(dataProvider = "QuantizerTestProvider", enabled = true) - public void testQuantizer(QuantizerTestProvider cfg) { - QualQuantizer qq = new QualQuantizer(cfg.nObservationsPerQual, cfg.nLevels, 0); - logger.warn("cfg: " + cfg); - for ( int i = 0; i < cfg.expectedMap.size(); i++) { - int expected = cfg.expectedMap.get(i); - int observed = qq.originalToQuantizedMap.get(i); - //logger.warn(String.format(" qq map: %s : %d => %d", i, expected, observed)); - Assert.assertEquals(observed, expected); - } - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ReadCovariatesUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ReadCovariatesUnitTest.java deleted file mode 100644 index b765e4d5b..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ReadCovariatesUnitTest.java +++ /dev/null @@ -1,148 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.recalibration.covariates.*; -import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import java.util.Random; - -/** - * @author carneiro - * @since 4/21/12 - */ -public class ReadCovariatesUnitTest { - - @BeforeMethod - public void init() { - ReadCovariates.clearKeysCache(); - } - - @Test(enabled = false) - public void testCovariateGeneration() { - final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - final String RGID = "id"; - - ReadGroupCovariate rgCov = new ReadGroupCovariate(); - QualityScoreCovariate qsCov = new QualityScoreCovariate(); - ContextCovariate coCov = new ContextCovariate(); - CycleCovariate cyCov = new CycleCovariate(); - - rgCov.initialize(RAC); - qsCov.initialize(RAC); - coCov.initialize(RAC); - cyCov.initialize(RAC); - - Covariate[] requestedCovariates = new Covariate[4]; - requestedCovariates[0] = rgCov; - requestedCovariates[1] = qsCov; - requestedCovariates[2] = coCov; - requestedCovariates[3] = cyCov; - - final int NUM_READS = 100; - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - - final String[] readGroups = {"RG1", "RG2", "RGbla"}; - for (int idx = 0; idx < NUM_READS; idx++) { - for (final String rgs : readGroups) { - final int length = 10 + rnd.nextInt(100); // random read length, at least 10 bp long - final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); - final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(rgs); - rg.setPlatform("illumina"); - read.setReadGroup(rg); - read.setReadNegativeStrandFlag(rnd.nextBoolean()); - final byte[] mQuals = read.getBaseQualities(EventType.BASE_SUBSTITUTION); - final byte[] iQuals = read.getBaseQualities(EventType.BASE_INSERTION); - final byte[] dQuals = read.getBaseQualities(EventType.BASE_DELETION); - ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); - - // check that the length is correct - Assert.assertEquals(rc.getMismatchesKeySet().length, length); - Assert.assertEquals(rc.getInsertionsKeySet().length, length); - Assert.assertEquals(rc.getDeletionsKeySet().length, length); - - for (int i = 0; i < length; i++) { - // check that read group is always the same - Assert.assertEquals(rgCov.formatKey(rc.getMismatchesKeySet(i)[0]), rgs); - Assert.assertEquals(rgCov.formatKey(rc.getInsertionsKeySet(i)[0]), rgs); - Assert.assertEquals(rgCov.formatKey(rc.getDeletionsKeySet(i)[0]), rgs); - - // check quality score - Assert.assertEquals(qsCov.formatKey(rc.getMismatchesKeySet(i)[1]), "" + mQuals[i]); - Assert.assertEquals(qsCov.formatKey(rc.getInsertionsKeySet(i)[1]), "" + iQuals[i]); - Assert.assertEquals(qsCov.formatKey(rc.getDeletionsKeySet(i)[1]), "" + dQuals[i]); - - // check context - Assert.assertEquals(coCov.formatKey(rc.getMismatchesKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.MISMATCHES_CONTEXT_SIZE)); - Assert.assertEquals(coCov.formatKey(rc.getInsertionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); - Assert.assertEquals(coCov.formatKey(rc.getDeletionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); - - // check cycle - Assert.assertEquals(cyCov.formatKey(rc.getMismatchesKeySet(i)[3]), "" + (i+1)); - Assert.assertEquals(cyCov.formatKey(rc.getInsertionsKeySet(i)[3]), "" + (i+1)); - Assert.assertEquals(cyCov.formatKey(rc.getDeletionsKeySet(i)[3]), "" + (i+1)); - } - - } - - } - - } - -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ReadGroupCovariateUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ReadGroupCovariateUnitTest.java deleted file mode 100644 index 47bbf38a4..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ReadGroupCovariateUnitTest.java +++ /dev/null @@ -1,126 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.recalibration.covariates.ReadGroupCovariate; -import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -/** - * @author Mauricio Carneiro - * @since 3/1/12 - */ -public class ReadGroupCovariateUnitTest { - ReadGroupCovariate covariate; - RecalibrationArgumentCollection RAC; - - @BeforeClass - public void init() { - RAC = new RecalibrationArgumentCollection(); - covariate = new ReadGroupCovariate(); - covariate.initialize(RAC); - } - - @BeforeMethod - public void initCache() { - ReadCovariates.clearKeysCache(); - } - - @Test(enabled = true) - public void testSingleRecord() { - final String expected = "SAMPLE.1"; - GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID"); - rg.setPlatformUnit(expected); - runTest(rg, expected, covariate); - } - - @Test(enabled = true) - public void testMissingPlatformUnit() { - final String expected = "MY.7"; - GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(expected); - runTest(rg, expected, covariate); - } - - @Test(enabled = true) - public void testForceReadgroup() { - final RecalibrationArgumentCollection forcedRAC = new RecalibrationArgumentCollection(); - forcedRAC.FORCE_READGROUP = "FOO"; - final ReadGroupCovariate forcedCovariate = new ReadGroupCovariate(); - forcedCovariate.initialize(forcedRAC); - - final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("NOT_FOO"); - runTest(rg, "FOO", forcedCovariate); - } - - private static void runTest(final GATKSAMReadGroupRecord rg, final String expected, final ReadGroupCovariate covariate) { - GATKSAMRecord read = ReadUtils.createRandomRead(10); - read.setReadGroup(rg); - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), expected, covariate); - - } - - private static void verifyCovariateArray(final int[][] values, final String expected, final ReadGroupCovariate covariate) { - for (int[] value : values) { - String actual = covariate.formatKey(value[0]); - Assert.assertEquals(actual, expected); - } - } - -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalDatumUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalDatumUnitTest.java deleted file mode 100644 index 3c3842f70..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalDatumUnitTest.java +++ /dev/null @@ -1,313 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - - -// the imports for unit testing. - - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.Arrays; -import java.util.Collections; - - -public class RecalDatumUnitTest extends BaseTest { - - // -------------------------------------------------------------------------------- - // - // merge case Provider - // - // -------------------------------------------------------------------------------- - - private class RecalDatumTestProvider extends TestDataProvider { - int exError, exTotal, reportedQual; - - private RecalDatumTestProvider(int E, int N, int reportedQual) { - super(RecalDatumTestProvider.class); - - this.exError = E; - this.exTotal = N; - this.reportedQual = reportedQual; - } - - public double getErrorRate() { - return (exError + 1) / (1.0 * (exTotal + 2)); - } - - public double getErrorRatePhredScaled() { - return QualityUtils.phredScaleErrorRate(getErrorRate()); - } - - public int getReportedQual() { - return reportedQual; - } - - public RecalDatum makeRecalDatum() { - return new RecalDatum((long)exTotal, (double)exError, (byte)getReportedQual()); - } - - @Override - public String toString() { - return String.format("exError=%d, exTotal=%d, reportedQual=%d", exError, exTotal, reportedQual); - } - } - - private static boolean createdDatumTestProviders = false; - - @DataProvider(name = "RecalDatumTestProvider") - public Object[][] makeRecalDatumTestProvider() { - if ( !createdDatumTestProviders ) { - for ( int E : Arrays.asList(1, 10, 100, 1000, 10000) ) - for ( int N : Arrays.asList(10, 100, 1000, 10000, 100000, 1000000) ) - for ( int reportedQual : Arrays.asList(10, 20) ) - if ( E <= N ) - new RecalDatumTestProvider(E, N, reportedQual); - createdDatumTestProviders = true; - } - - return RecalDatumTestProvider.getTests(RecalDatumTestProvider.class); - } - - @Test(dataProvider = "RecalDatumTestProvider") - public void testRecalDatumBasics(RecalDatumTestProvider cfg) { - final RecalDatum datum = cfg.makeRecalDatum(); - assertBasicFeaturesOfRecalDatum(datum, cfg); - } - - private static void assertBasicFeaturesOfRecalDatum(final RecalDatum datum, final RecalDatumTestProvider cfg) { - Assert.assertEquals(datum.getNumMismatches(), cfg.exError, 1E-6); - Assert.assertEquals(datum.getNumObservations(), cfg.exTotal, 1E-6); - if ( cfg.getReportedQual() != -1 ) - Assert.assertEquals(datum.getEstimatedQReportedAsByte(), cfg.getReportedQual()); - BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalErrorRate(), cfg.getErrorRate()); - - final double e = datum.getEmpiricalQuality(); - Assert.assertTrue(datum.getEmpiricalQualityAsByte() >= Math.floor(e)); - Assert.assertTrue(datum.getEmpiricalQualityAsByte() <= Math.ceil(e)); - Assert.assertNotNull(datum.toString()); - } - - @Test(dataProvider = "RecalDatumTestProvider") - public void testRecalDatumCopyAndCombine(RecalDatumTestProvider cfg) { - final RecalDatum datum = cfg.makeRecalDatum(); - final RecalDatum copy = new RecalDatum(datum); - assertBasicFeaturesOfRecalDatum(copy, cfg); - - RecalDatumTestProvider combinedCfg = new RecalDatumTestProvider(cfg.exError * 2, cfg.exTotal * 2, cfg.reportedQual); - copy.combine(datum); - assertBasicFeaturesOfRecalDatum(copy, combinedCfg); - } - - @Test(dataProvider = "RecalDatumTestProvider") - public void testRecalDatumModification(RecalDatumTestProvider cfg) { - RecalDatum datum = cfg.makeRecalDatum(); - datum.setEmpiricalQuality(10.1); - Assert.assertEquals(datum.getEmpiricalQuality(), 10.1); - - datum.setEstimatedQReported(10.1); - Assert.assertEquals(datum.getEstimatedQReported(), 10.1); - Assert.assertEquals(datum.getEstimatedQReportedAsByte(), 10); - - datum = cfg.makeRecalDatum(); - cfg.exTotal = 100000; - datum.setNumObservations(cfg.exTotal); - assertBasicFeaturesOfRecalDatum(datum, cfg); - - datum = cfg.makeRecalDatum(); - cfg.exError = 1000; - datum.setNumMismatches(cfg.exError); - assertBasicFeaturesOfRecalDatum(datum, cfg); - - datum = cfg.makeRecalDatum(); - datum.increment(true); - cfg.exError++; - cfg.exTotal++; - assertBasicFeaturesOfRecalDatum(datum, cfg); - - datum = cfg.makeRecalDatum(); - datum.increment(false); - cfg.exTotal++; - assertBasicFeaturesOfRecalDatum(datum, cfg); - - datum = cfg.makeRecalDatum(); - datum.incrementNumObservations(2); - cfg.exTotal += 2; - assertBasicFeaturesOfRecalDatum(datum, cfg); - - datum = cfg.makeRecalDatum(); - datum.incrementNumMismatches(2); - cfg.exError += 2; - assertBasicFeaturesOfRecalDatum(datum, cfg); - - - datum = cfg.makeRecalDatum(); - datum.increment(10, 5); - cfg.exError += 5; - cfg.exTotal += 10; - assertBasicFeaturesOfRecalDatum(datum, cfg); - } - - @Test - public void testNoObs() { - final RecalDatum rd = new RecalDatum(0L, 0.0, (byte)10); - Assert.assertEquals(rd.getEmpiricalErrorRate(), 0.0); - } - - @Test - public void testlog10QempPrior() { - for ( int Qemp = 0; Qemp <= QualityUtils.MAX_SAM_QUAL_SCORE; Qemp++ ) { - for ( int Qrep = 0; Qrep <= QualityUtils.MAX_SAM_QUAL_SCORE; Qrep++ ) { - final double log10prior = RecalDatum.log10QempPrior(Qemp, Qrep); - Assert.assertTrue(log10prior < 0.0); - Assert.assertFalse(Double.isInfinite(log10prior)); - Assert.assertFalse(Double.isNaN(log10prior)); - } - } - - final int Qrep = 20; - int maxQemp = -1; - double maxQempValue = -Double.MAX_VALUE; - for ( int Qemp = 0; Qemp <= QualityUtils.MAX_SAM_QUAL_SCORE; Qemp++ ) { - final double log10prior = RecalDatum.log10QempPrior(Qemp, Qrep); - if ( log10prior > maxQempValue ) { - maxQemp = Qemp; - maxQempValue = log10prior; - } - } - Assert.assertEquals(maxQemp, Qrep); - } - - @Test - public void testBayesianEstimateOfEmpiricalQuality() { - - final int Qrep = 20; - - // test no shift - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(0, 0, Qrep), (double)Qrep); - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10, 0, Qrep), (double)Qrep); - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000, 10, Qrep), (double)Qrep); - - // test small shift - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10, 10, Qrep), Qrep - 1.0); - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000, 0, Qrep), Qrep + 1.0); - - // test medium shift - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10000, 0, Qrep), Qrep + 3.0); - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10000, 10, Qrep), Qrep + 3.0); - - // test large shift - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(100000, 10, Qrep), Qrep + 8.0); - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000000, 10, Qrep), Qrep + 16.0); - } - - @Test - public void testlog10QempLikelihood() { - - final double[] Qemps = new double[] { 0.0, 10.0, 20.0, 30.0 }; - final int[] observations = new int[] {0, 10, 1000, 1000000}; - final int[] errors = new int[] {0, 10, 1000, 1000000}; - - for ( double Qemp : Qemps ) { - for ( int observation : observations ) { - for ( int error : errors ) { - if ( error > observation ) - continue; - - final double log10likelihood = RecalDatum.log10QempLikelihood(Qemp, observation, error); - Assert.assertTrue(observation == 0 ? MathUtils.compareDoubles(log10likelihood, 0.0) == 0 : log10likelihood < 0.0); - Assert.assertFalse(Double.isInfinite(log10likelihood)); - Assert.assertFalse(Double.isNaN(log10likelihood)); - } - } - } - - long bigNum = new Long((long)Integer.MAX_VALUE); - bigNum *= 2L; - final double log10likelihood = RecalDatum.log10QempLikelihood(30, bigNum, 100000); - Assert.assertTrue(log10likelihood < 0.0); - Assert.assertFalse(Double.isInfinite(log10likelihood)); - Assert.assertFalse(Double.isNaN(log10likelihood)); - } - - @Test - public void basicHierarchicalBayesianQualityEstimateTest() { - - for( double epsilon = 15.0; epsilon <= 60.0; epsilon += 2.0 ) { - double RG_Q = 45.0; - RecalDatum RG = new RecalDatum( (long)100000000, (long) (100000000 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q); - double Q = 30.0; - RecalDatum QS = new RecalDatum( (long)100000000, (long) (100000000 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q); - RecalDatum COV = new RecalDatum( (long)15, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality - - // initial epsilon condition shouldn't matter when there are a lot of observations - Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( epsilon, RG, QS, Collections.singletonList(COV)), Q, 1E-4 ); - } - - for( double epsilon = 15.0; epsilon <= 60.0; epsilon += 2.0 ) { - double RG_Q = 45.0; - RecalDatum RG = new RecalDatum( (long)10, (long) (10 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q); - double Q = 30.0; - RecalDatum QS = new RecalDatum( (long)10, (long) (10 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q); - RecalDatum COV = new RecalDatum( (long)15, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality - - // initial epsilon condition dominates when there is no data - Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( epsilon, RG, QS, Collections.singletonList(COV)), epsilon, 1E-4 ); - } - - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalUtilsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalUtilsUnitTest.java deleted file mode 100644 index 7fca0be93..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalUtilsUnitTest.java +++ /dev/null @@ -1,178 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; - -public final class RecalUtilsUnitTest extends BaseTest { - private class Row { - int rg, qual, ne, no; - - private Row(final Row copy) { - this(copy.rg, copy.qual, copy.ne, copy.no); - } - - private Row(int rg, int qual, int ne, int no) { - this.rg = rg; - this.qual = qual; - this.ne = ne; - this.no = no; - } - - @Override - public String toString() { - return "Row{" + - "" + rg + - ", " + qual + - ", " + ne + - ", " + no + - '}'; - } - } - - @DataProvider(name = "CombineTablesProvider") - public Object[][] createCombineTablesProvider() { - List tests = new ArrayList(); - - final List rows = new ArrayList(); - for ( final int rg : Arrays.asList(0, 1) ) { - for ( final int qual : Arrays.asList(0, 1) ) { - rows.add(new Row(rg, qual, 1, 10)); - } - } - - logger.warn("Number of rows " + rows.size()); - - List> permutations = new LinkedList>(); - permutations.addAll(Utils.makePermutations(rows, 1, false)); - permutations.addAll(Utils.makePermutations(rows, 2, false)); - permutations.addAll(Utils.makePermutations(rows, 3, false)); - - // adding 1 row to 2 - for ( final List table1 : permutations ) { - for ( final Row table2 : rows ) { - tests.add(new Object[]{table1, Arrays.asList(table2)}); - } - } - - // adding 2 rows to 1 - for ( final List table1 : permutations ) { - for ( final Row table2 : rows ) { - tests.add(new Object[]{Arrays.asList(table2), table1}); - } - } - - for ( final List table1 : permutations ) { - for ( final List table2 : permutations ) { - tests.add(new Object[]{table1, table2}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "CombineTablesProvider") - public void testCombineTables(final List table1, final List table2) { - final NestedIntegerArray nia1 = makeTable(table1); - final NestedIntegerArray nia2 = makeTable(table2); - final List expectedRows = makeExpected(table1, table2); - final NestedIntegerArray expected = makeTable(expectedRows); - RecalUtils.combineTables(nia1, nia2); - - Assert.assertEquals(nia1.getDimensions(), expected.getDimensions()); - Assert.assertEquals(nia1.getAllValues().size(), expected.getAllValues().size()); - - for ( final NestedIntegerArray.Leaf leaf : expected.getAllLeaves() ) { - final RecalDatum actual = nia1.get(leaf.keys); - Assert.assertEquals(actual.getNumMismatches(), leaf.value.getNumMismatches()); - Assert.assertEquals(actual.getNumObservations(), leaf.value.getNumObservations()); - } - } - - public List makeExpected(final List table1, final List table2) { - final List combined = new LinkedList(); - for ( final Row t1 : table1 ) combined.add(new Row(t1)); - for ( final Row t2 : table2 ) { - combine(combined, t2); - } - return combined; - } - - private void combine(final List combined, final Row row) { - for ( final Row c : combined ) { - if ( c.rg == row.rg && c.qual == row.qual ) { - c.ne += row.ne; - c.no += row.no; - return; - } - } - - combined.add(new Row(row)); - } - - public NestedIntegerArray makeTable(final List rows) { - final NestedIntegerArray x = new NestedIntegerArray(3, 3); - for ( final Row r : rows ) - x.put(new RecalDatum((long)r.no, (double)r.ne, (byte)10), r.rg, r.qual); - return x; - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationReportUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationReportUnitTest.java deleted file mode 100644 index e38ce4687..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationReportUnitTest.java +++ /dev/null @@ -1,176 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.recalibration.covariates.*; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; -import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * @author carneiro - * @since 4/21/12 - */ -public class RecalibrationReportUnitTest { - @BeforeMethod - public void init() { - ReadCovariates.clearKeysCache(); - } - - private static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { - final Random random = new Random(); - final int nObservations = random.nextInt(maxObservations); - final int nErrors = Math.min(random.nextInt(maxErrors), nObservations); - final int qual = random.nextInt(QualityUtils.MAX_SAM_QUAL_SCORE); - return new RecalDatum((long)nObservations, (double)nErrors, (byte)qual); - } - - @Test - public void testOutput() { - final int length = 100; - - List quals = new ArrayList(QualityUtils.MAX_SAM_QUAL_SCORE + 1); - List counts = new ArrayList(QualityUtils.MAX_SAM_QUAL_SCORE + 1); - - for (int i = 0; i<= QualityUtils.MAX_SAM_QUAL_SCORE; i++) { - quals.add((byte) i); - counts.add(1L); - } - - final QuantizationInfo quantizationInfo = new QuantizationInfo(quals, counts); - final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - - quantizationInfo.noQuantization(); - final List requiredCovariates = new LinkedList(); - final List optionalCovariates = new LinkedList(); - - final ReadGroupCovariate rgCovariate = new ReadGroupCovariate(); - rgCovariate.initialize(RAC); - requiredCovariates.add(rgCovariate); - - final QualityScoreCovariate qsCovariate = new QualityScoreCovariate(); - qsCovariate.initialize(RAC); - requiredCovariates.add(qsCovariate); - - final ContextCovariate cxCovariate = new ContextCovariate(); - cxCovariate.initialize(RAC); - optionalCovariates.add(cxCovariate); - final CycleCovariate cyCovariate = new CycleCovariate(); - cyCovariate.initialize(RAC); - optionalCovariates.add(cyCovariate); - - final Covariate[] requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; - int covariateIndex = 0; - for (final Covariate cov : requiredCovariates) - requestedCovariates[covariateIndex++] = cov; - for (final Covariate cov : optionalCovariates) - requestedCovariates[covariateIndex++] = cov; - - final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("id"); - rg.setPlatform("illumina"); - final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); - read.setReadGroup(rg); - final byte [] readQuals = new byte[length]; - for (int i = 0; i < length; i++) - readQuals[i] = 20; - read.setBaseQualities(readQuals); - - final int expectedKeys = expectedNumberOfKeys(length, RAC.INDELS_CONTEXT_SIZE, RAC.MISMATCHES_CONTEXT_SIZE); - int nKeys = 0; // keep track of how many keys were produced - final ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); - - final RecalibrationTables recalibrationTables = new RecalibrationTables(requestedCovariates); - final NestedIntegerArray rgTable = recalibrationTables.getReadGroupTable(); - final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); - - for (int offset = 0; offset < length; offset++) { - - for (EventType errorMode : EventType.values()) { - - final int[] covariates = rc.getKeySet(offset, errorMode); - final int randomMax = errorMode == EventType.BASE_SUBSTITUTION ? 10000 : 100000; - - rgTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.ordinal()); - qualTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.ordinal()); - nKeys += 2; - for (int j = 0; j < optionalCovariates.size(); j++) { - final NestedIntegerArray covTable = recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j); - final int covValue = covariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j]; - if ( covValue >= 0 ) { - covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], covValue, errorMode.ordinal()); - nKeys++; - } - } - } - } - Assert.assertEquals(nKeys, expectedKeys); - } - - private static int expectedNumberOfKeys (int readLength, int indelContextSize, int mismatchesContextSize) { - final int numCovariates = 4; - final int numTables = 3; - final int mismatchContextPadding = mismatchesContextSize - 1; - final int indelContextPadding = 2 * (indelContextSize - 1); - final int indelCyclePadding = 2 * (2 * CycleCovariate.CUSHION_FOR_INDELS); - - return (numCovariates * numTables * readLength) - mismatchContextPadding - indelContextPadding - indelCyclePadding; - } - -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTablesUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTablesUnitTest.java deleted file mode 100644 index bed21cba1..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTablesUnitTest.java +++ /dev/null @@ -1,202 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; -import org.broadinstitute.gatk.utils.recalibration.covariates.*; -import org.testng.Assert; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import java.util.Arrays; -import java.util.List; - -public final class RecalibrationTablesUnitTest extends BaseTest { - private RecalibrationTables tables; - private Covariate[] covariates; - private int numReadGroups = 6; - final byte qualByte = 1; - final List combineStates = Arrays.asList(0, 1, 2); - - @BeforeMethod - private void makeTables() { - covariates = RecalibrationTestUtils.makeInitializedStandardCovariates(); - tables = new RecalibrationTables(covariates, numReadGroups); - fillTable(tables); - } - - private void fillTable(final RecalibrationTables tables) { - for ( int iterations = 0; iterations < 10; iterations++ ) { - for ( final EventType et : EventType.values() ) { - for ( final int rg : combineStates) { - final double error = rg % 2 == 0 ? 1 : 0; - RecalUtils.incrementDatumOrPutIfNecessary(tables.getReadGroupTable(), qualByte, error, rg, et.ordinal()); - for ( final int qual : combineStates) { - RecalUtils.incrementDatumOrPutIfNecessary(tables.getQualityScoreTable(), qualByte, error, rg, qual, et.ordinal()); - for ( final int cycle : combineStates) - RecalUtils.incrementDatumOrPutIfNecessary(tables.getTable(2), qualByte, error, rg, qual, cycle, et.ordinal()); - for ( final int context : combineStates) - RecalUtils.incrementDatumOrPutIfNecessary(tables.getTable(3), qualByte, error, rg, qual, context, et.ordinal()); - } - } - } - } - } - - @Test - public void basicTest() { - final Covariate qualCov = covariates[1]; - final Covariate cycleCov = covariates[2]; - final Covariate contextCov = covariates[3]; - - Assert.assertEquals(tables.numTables(), covariates.length); - - Assert.assertNotNull(tables.getReadGroupTable()); - Assert.assertEquals(tables.getReadGroupTable(), tables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal())); - testDimensions(tables.getReadGroupTable(), numReadGroups); - - Assert.assertNotNull(tables.getQualityScoreTable()); - Assert.assertEquals(tables.getQualityScoreTable(), tables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE.ordinal())); - testDimensions(tables.getQualityScoreTable(), numReadGroups, qualCov.maximumKeyValue() + 1); - - Assert.assertNotNull(tables.getTable(2)); - testDimensions(tables.getTable(2), numReadGroups, qualCov.maximumKeyValue() + 1, cycleCov.maximumKeyValue() + 1); - - Assert.assertNotNull(tables.getTable(3)); - testDimensions(tables.getTable(3), numReadGroups, qualCov.maximumKeyValue() + 1, contextCov.maximumKeyValue() + 1); - } - - private void testDimensions(final NestedIntegerArray table, final int ... dimensions) { - final int[] dim = new int[dimensions.length+1]; - System.arraycopy(dimensions, 0, dim, 0, dimensions.length); - dim[dimensions.length] = EventType.values().length; - Assert.assertEquals(table.getDimensions().length, dim.length); - - for ( int i = 0; i < dim.length; i++ ) { - Assert.assertEquals(table.getDimensions()[i], dim[i], "Table dimensions not expected at dim " + i); - } - } - - @Test - public void basicMakeQualityScoreTable() { - final Covariate qualCov = covariates[1]; - final NestedIntegerArray copy = tables.makeQualityScoreTable(); - testDimensions(copy, numReadGroups, qualCov.maximumKeyValue()+1); - Assert.assertEquals(copy.getAllValues().size(), 0); - } - - @Test - public void testCombine1() { - final RecalibrationTables merged = new RecalibrationTables(covariates, numReadGroups); - fillTable(merged); - - merged.combine(tables); - - for ( int i = 0; i < tables.numTables(); i++ ) { - NestedIntegerArray table = tables.getTable(i); - NestedIntegerArray mergedTable = merged.getTable(i); - - Assert.assertEquals(table.getAllLeaves().size(), mergedTable.getAllLeaves().size()); - for ( final NestedIntegerArray.Leaf leaf : table.getAllLeaves() ) { - final RecalDatum mergedValue = mergedTable.get(leaf.keys); - Assert.assertNotNull(mergedValue); - Assert.assertEquals(mergedValue.getNumObservations(), leaf.value.getNumObservations() * 2); - Assert.assertEquals(mergedValue.getNumMismatches(), leaf.value.getNumMismatches() * 2); - } - } - } - - @Test - public void testCombineEmptyOther() { - final RecalibrationTables merged = new RecalibrationTables(covariates, numReadGroups); - - merged.combine(tables); - - for ( int i = 0; i < tables.numTables(); i++ ) { - NestedIntegerArray table = tables.getTable(i); - NestedIntegerArray mergedTable = merged.getTable(i); - - Assert.assertEquals(table.getAllLeaves().size(), mergedTable.getAllLeaves().size()); - for ( final NestedIntegerArray.Leaf leaf : table.getAllLeaves() ) { - final RecalDatum mergedValue = mergedTable.get(leaf.keys); - Assert.assertNotNull(mergedValue); - Assert.assertEquals(mergedValue.getNumObservations(), leaf.value.getNumObservations()); - Assert.assertEquals(mergedValue.getNumMismatches(), leaf.value.getNumMismatches()); - } - } - } - - @Test - public void testCombinePartial() { - final RecalibrationTables merged = new RecalibrationTables(covariates, numReadGroups); - for ( final int rg : combineStates) { - RecalUtils.incrementDatumOrPutIfNecessary(merged.getTable(3), qualByte, 1, rg, 0, 0, 0); - } - - merged.combine(tables); - for ( int i = 0; i < tables.numTables(); i++ ) { - NestedIntegerArray table = tables.getTable(i); - NestedIntegerArray mergedTable = merged.getTable(i); - - Assert.assertEquals(table.getAllLeaves().size(), mergedTable.getAllLeaves().size()); - for ( final NestedIntegerArray.Leaf leaf : table.getAllLeaves() ) { - final RecalDatum mergedValue = mergedTable.get(leaf.keys); - Assert.assertNotNull(mergedValue); - - final int delta = i == 3 && leaf.keys[1] == 0 && leaf.keys[2] == 0 && leaf.keys[3] == 0 ? 1 : 0; - Assert.assertEquals(mergedValue.getNumObservations(), leaf.value.getNumObservations() + delta); - Assert.assertEquals(mergedValue.getNumMismatches(), leaf.value.getNumMismatches() + delta); - } - } - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTestUtils.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTestUtils.java deleted file mode 100644 index 306648ca3..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTestUtils.java +++ /dev/null @@ -1,75 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.recalibration.covariates.*; - -/** - * Created with IntelliJ IDEA. - * User: depristo - * Date: 12/23/12 - * Time: 1:06 PM - * To change this template use File | Settings | File Templates. - */ -public class RecalibrationTestUtils { - public static Covariate[] makeInitializedStandardCovariates() { - final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - final Covariate[] covariates = new Covariate[4]; - covariates[0] = new ReadGroupCovariate(); - covariates[1] = new QualityScoreCovariate(); - covariates[2] = new ContextCovariate(); - covariates[3] = new CycleCovariate(); - for ( Covariate cov : covariates ) cov.initialize(RAC); - return covariates; - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RepeatCovariatesUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RepeatCovariatesUnitTest.java deleted file mode 100644 index 877f4e911..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RepeatCovariatesUnitTest.java +++ /dev/null @@ -1,250 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import com.google.java.contract.Requires; -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.recalibration.covariates.*; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Random; - -public class RepeatCovariatesUnitTest { - - RepeatLengthCovariate rlCovariate; - RepeatUnitCovariate ruCovariate; - RepeatUnitAndLengthCovariate rurlCovariate; - RecalibrationArgumentCollection RAC; - - - - @BeforeClass - public void init() { - RAC = new RecalibrationArgumentCollection(); - rlCovariate = new RepeatLengthCovariate(); - ruCovariate = new RepeatUnitCovariate(); - rurlCovariate = new RepeatUnitAndLengthCovariate(); - rlCovariate.initialize(RAC); - ruCovariate.initialize(RAC); - rurlCovariate.initialize(RAC); - } - - @BeforeMethod - public void initCache() { - ReadCovariates.clearKeysCache(); - } - - - @Test - public void testFindNumberOfRepetitions() { - // First, test logic to compute number of repetitions of a substring on a given string. - int result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACAC".getBytes(), true); - Assert.assertEquals(2,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACAC".getBytes(), true); - Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), true); - Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), true); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), true); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), true); - Assert.assertEquals(1,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), true); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), true); - Assert.assertEquals(0,result); - // Same tests but looking backward on string - result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACAC".getBytes(), false); - Assert.assertEquals(2,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACAC".getBytes(), false); - Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), false); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), false); - Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), false); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), false); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), false); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), false); - Assert.assertEquals(3,result); - - // test logic to get repeat unit and number of repeats from covariate value - final String[] repUnits = new String[]{"AG","CCG","TCCA","T"}; - for (String ru : repUnits) { - for (int k=1; k < 10; k++) { - Pair pair = RepeatLengthCovariate.getRUandNRfromCovariate(String.format("%s%d",ru,k)); - Assert.assertEquals(pair.second.intValue(),k); - Assert.assertEquals(pair.first,ru); - } - } - - } - - /** - * Build synthetic reads with random content made up of tandem repeats, record computed Repeat Unit and # repeats and see if - * they match with read context - */ - @Test - public void testManyObservations() { - final int NUM_UNITS = 10; - final int MAX_REPEAT_UNIT_LENGTH = RAC.MAX_STR_UNIT_LENGTH; - final int MAX_NUM_REPETITIONS = RAC.MAX_REPEAT_LENGTH; - final int NUM_TEST_CASES = 100; - - Random random = new Random(); - - for (int r = 0; r < NUM_TEST_CASES; r++) { - final StringBuilder sb = new StringBuilder(); - // for each unit, generate a repeat unit at random with given random length - final ArrayList repeatUnits = new ArrayList(); - final ArrayList numsRepetitions = new ArrayList(); - for (int n=0; n < NUM_UNITS; n++) { - final int repLength = 1+random.nextInt(MAX_REPEAT_UNIT_LENGTH); - final String repeatUnit = getRandomBases(repLength); - final int numRepetitions = 1+random.nextInt(MAX_NUM_REPETITIONS); - - // log for comparison with covariate - numsRepetitions.add(numRepetitions); - repeatUnits.add(repeatUnit); - - for (int k=0; k < numRepetitions; k++) - sb.append(repeatUnit); - - } - - final String readBases = sb.toString(); - System.out.println(readBases); - final int readLength = readBases.length(); - - final byte[] readQuals = new byte[readLength]; - Arrays.fill(readQuals,(byte)30); - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(),readQuals,readLength+"M"); - - Covariate[] requestedCovariates = new Covariate[3]; - requestedCovariates[0] = rlCovariate; - requestedCovariates[1] = ruCovariate; - requestedCovariates[2] = rurlCovariate; - ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); - - // check that the length is correct - Assert.assertEquals(rc.getMismatchesKeySet().length, readLength); - Assert.assertEquals(rc.getInsertionsKeySet().length, readLength); - Assert.assertEquals(rc.getDeletionsKeySet().length, readLength); - - for (int offset = 0; offset < readBases.length(); offset++) { // recalibrate all bases in the read - // check RepeatLength - final String rlValM = rlCovariate.formatKey(rc.getMismatchesKeySet(offset)[0]); - final String rlValI = rlCovariate.formatKey(rc.getInsertionsKeySet(offset)[0]); - final String rlValD = rlCovariate.formatKey(rc.getDeletionsKeySet(offset)[0]); - // check RepeatUnit - final String ruValM = ruCovariate.formatKey(rc.getMismatchesKeySet(offset)[1]); - final String ruValI = ruCovariate.formatKey(rc.getInsertionsKeySet(offset)[1]); - final String ruValD = ruCovariate.formatKey(rc.getDeletionsKeySet(offset)[1]); - // check RepeatUnitAndLength - final String rurlValM = rurlCovariate.formatKey(rc.getMismatchesKeySet(offset)[2]); - final String rurlValI = rurlCovariate.formatKey(rc.getInsertionsKeySet(offset)[2]); - final String rurlValD = rurlCovariate.formatKey(rc.getDeletionsKeySet(offset)[2]); - // check all 3 values are identical - Assert.assertEquals(rlValD,rlValI); - Assert.assertEquals(rlValM,rlValI); - Assert.assertEquals(ruValD,ruValI); - Assert.assertEquals(ruValM,ruValI); - Assert.assertEquals(rurlValD,rurlValI); - Assert.assertEquals(rurlValM,rurlValI); - - - int fw = GATKVariantContextUtils.findNumberOfRepetitions(ruValM.getBytes(), readBases.substring(offset + 1, readLength).getBytes(), true); - int bw = GATKVariantContextUtils.findNumberOfRepetitions(ruValM.getBytes(), readBases.substring(0, offset + 1).getBytes(), false); - Assert.assertEquals(Math.min(fw+bw,RAC.MAX_REPEAT_LENGTH),(int)Integer.valueOf(rlValM)); - } - - } - - - - - - - } - - /** - * Returns random bases of given length - * @param length required length - * @return given random string - */ - @Requires("length > 0") - String getRandomBases(final int length) { - byte[] bases = new byte[length]; - Random ran = new Random(); - for (int i=0; i < length; i++ ) { - int idx = ran.nextInt(4); - bases[i] = BaseUtils.baseIndexToSimpleBase(idx); - } - return new String(bases); - } - - -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/variant/VariantContextMergerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/variant/VariantContextMergerUnitTest.java deleted file mode 100644 index eaa76becb..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/variant/VariantContextMergerUnitTest.java +++ /dev/null @@ -1,279 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.variant; - -import htsjdk.variant.variantcontext.*; -import org.broadinstitute.gatk.utils.*; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -/** - * Tests {@link org.broadinstitute.gatk.utils.variant.ReferenceConfidenceVariantContextMerger}. - * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class VariantContextMergerUnitTest extends BaseTest { - Allele Aref, T, C, G, Cref, ATC, ATCATC; - Allele ATCATCT; - Allele ATref; - Allele Anoref; - Allele GT; - - private GenomeLocParser genomeLocParser; - - @BeforeSuite - public void setup() throws IOException { - // alleles - Aref = Allele.create("A", true); - Cref = Allele.create("C", true); - T = Allele.create("T"); - C = Allele.create("C"); - G = Allele.create("G"); - ATC = Allele.create("ATC"); - ATCATC = Allele.create("ATCATC"); - ATCATCT = Allele.create("ATCATCT"); - ATref = Allele.create("AT",true); - Anoref = Allele.create("A",false); - GT = Allele.create("GT",false); - genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(hg18Reference))); - } - - @Test(dataProvider = "referenceConfidenceMergeData") - public void testReferenceConfidenceMerge(final String testID, final List toMerge, final GenomeLoc loc, final boolean returnSiteEvenIfMonomorphic, final VariantContext expectedResult) { - final VariantContext result = ReferenceConfidenceVariantContextMerger.merge(toMerge, loc, returnSiteEvenIfMonomorphic ? (byte) 'A' : null, true); - if ( result == null ) { - Assert.assertTrue(expectedResult == null); - return; - } - Assert.assertEquals(result.getAlleles(), expectedResult.getAlleles(),testID); - Assert.assertEquals(result.getNSamples(), expectedResult.getNSamples(),testID); - for ( final Genotype expectedGenotype : expectedResult.getGenotypes() ) { - Assert.assertTrue(result.hasGenotype(expectedGenotype.getSampleName()), "Missing " + expectedGenotype.getSampleName()); - // use string comparisons to test equality for now - Assert.assertEquals(result.getGenotype(expectedGenotype.getSampleName()).toString(), expectedGenotype.toString()); - } - } - - @Test - public void testGenerateADWithNewAlleles() { - - final int[] originalAD = new int[] {1,2,0}; - final int[] indexesOfRelevantAlleles = new int[] {0,1,2,2}; - - final int[] newAD = ReferenceConfidenceVariantContextMerger.generateAD(originalAD, indexesOfRelevantAlleles); - Assert.assertEquals(newAD, new int[]{1,2,0,0}); - } - - - @Test(expectedExceptions = UserException.class) - public void testGetIndexesOfRelevantAllelesWithNoALT() { - - final List alleles1 = new ArrayList<>(1); - alleles1.add(Allele.create("A", true)); - final List alleles2 = new ArrayList<>(1); - alleles2.add(Allele.create("A", true)); - ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(alleles1, alleles2, -1); - Assert.fail("We should have thrown an exception because the allele was not present"); - } - - @Test(dataProvider = "getIndexesOfRelevantAllelesData") - public void testGetIndexesOfRelevantAlleles(final int allelesIndex, final List allAlleles) { - final List myAlleles = new ArrayList<>(3); - - // always add the reference and alleles - myAlleles.add(allAlleles.get(0)); - myAlleles.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - // optionally add another alternate allele - if ( allelesIndex > 0 ) - myAlleles.add(allAlleles.get(allelesIndex)); - - final int[] indexes = ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1); - - Assert.assertEquals(indexes.length, allAlleles.size()); - - for ( int i = 0; i < allAlleles.size(); i++ ) { - if ( i == 0 ) - Assert.assertEquals(indexes[i], 0); // ref should always match - else if ( i == allelesIndex ) - Assert.assertEquals(indexes[i], 2); // allele - else - Assert.assertEquals(indexes[i], 1); // - } - } - - - @DataProvider(name = "referenceConfidenceMergeData") - public Object[][] makeReferenceConfidenceMergeData() { - final List tests = new ArrayList<>(); - final int start = 10; - final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, start, start); - final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make(); - final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start-1, start-1, Arrays.asList(Aref)).make(); - - final int[] standardPLs = new int[]{30, 20, 10, 71, 72, 73}; - final int[] reorderedSecondAllelePLs = new int[]{30, 71, 73, 20, 72, 10}; - - final List noCalls = new ArrayList<>(2); - noCalls.add(Allele.NO_CALL); - noCalls.add(Allele.NO_CALL); - - final List A_ALT = Arrays.asList(Aref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[]{0, 100, 1000}).alleles(noCalls).make(); - final VariantContext vcA_ALT = new VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make(); - final Allele AAref = Allele.create("AA", true); - final List AA_ALT = Arrays.asList(AAref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[]{0, 80, 800}).alleles(noCalls).make(); - final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make(); - final List A_C = Arrays.asList(Aref, C); - final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10}).alleles(noCalls).make(); - final List A_C_ALT = Arrays.asList(Aref, C, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).alleles(noCalls).make(); - final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make(); - final List A_G_ALT = Arrays.asList(Aref, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).alleles(noCalls).make(); - final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make(); - final List A_C_G = Arrays.asList(Aref, C, G); - final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make(); - final List A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74}).alleles(noCalls).make(); - final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make(); - final List A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).alleles(noCalls).make(); - final VariantContext vcA_ATC_ALT = new VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make(); - final Allele A = Allele.create("A", false); - final List AA_A_ALT = Arrays.asList(AAref, A, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).alleles(noCalls).make(); - final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make(); - - // first test the case of a single record - tests.add(new Object[]{"test00",Arrays.asList(vcA_C_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).make()}); - - // now, test pairs: - // a SNP with another SNP - tests.add(new Object[]{"test01",Arrays.asList(vcA_C_ALT, vcA_G_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); - // a SNP with an indel - tests.add(new Object[]{"test02",Arrays.asList(vcA_C_ALT, vcA_ATC_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); - // a SNP with 2 SNPs - tests.add(new Object[]{"test03",Arrays.asList(vcA_C_ALT, vcA_C_G_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make()}); - // a SNP with a ref record - tests.add(new Object[]{"test04",Arrays.asList(vcA_C_ALT, vcA_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make()}); - - // spanning records: - // a SNP with a spanning ref record - tests.add(new Object[]{"test05",Arrays.asList(vcA_C_ALT, vcAA_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make()}); - // a SNP with a spanning deletion - tests.add(new Object[]{"test06",Arrays.asList(vcA_C_ALT, vcAA_A_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).alleles(noCalls).make()).make()}); - - // combination of all - tests.add(new Object[]{"test07",Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).alleles(noCalls).make(), - new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).alleles(noCalls).make(), - new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).alleles(noCalls).make(), - new GenotypeBuilder("A_C_G").PL(new int[]{40,20,30,20,10,30,71,72,73,74}).alleles(noCalls).make(), - new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000}).alleles(noCalls).make(), - new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800}).alleles(noCalls).make(), - new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73}).alleles(noCalls).make()).make()}); - - // just spanning ref contexts, trying both instances where we want/do not want ref-only contexts - tests.add(new Object[]{"test08",Arrays.asList(vcAA_ALT), - - loc, false, - null}); - tests.add(new Object[]{"test09", Arrays.asList(vcAA_ALT), - loc, true, - new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[]{0}).alleles(noCalls).make()).make()}); - - final Object[][] result = tests.toArray(new Object[][]{}); - return result; - } - @DataProvider(name = "getIndexesOfRelevantAllelesData") - public Object[][] makeGetIndexesOfRelevantAllelesData() { - final int totalAlleles = 5; - final List alleles = new ArrayList<>(totalAlleles); - alleles.add(Allele.create("A", true)); - for ( int i = 1; i < totalAlleles; i++ ) - alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); - - final List tests = new ArrayList<>(); - - for ( int alleleIndex = 0; alleleIndex < totalAlleles; alleleIndex++ ) { - tests.add(new Object[]{alleleIndex, alleles}); - } - - return tests.toArray(new Object[][]{}); - } -} diff --git a/protected/pom.xml b/protected/pom.xml index d16d667c2..55f893152 100644 --- a/protected/pom.xml +++ b/protected/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-root - 3.3 + 3.4-SNAPSHOT ../public/gatk-root diff --git a/public/VectorPairHMM/pom.xml b/public/VectorPairHMM/pom.xml index ca0808701..9c4fa1347 100644 --- a/public/VectorPairHMM/pom.xml +++ b/public/VectorPairHMM/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-root - 3.3 + 3.4-SNAPSHOT ../../public/gatk-root diff --git a/public/VectorPairHMM/src/main/c++/Sandbox.java b/public/VectorPairHMM/src/main/c++/Sandbox.java index 605c5f5e8..ae8497ea7 100644 --- a/public/VectorPairHMM/src/main/c++/Sandbox.java +++ b/public/VectorPairHMM/src/main/c++/Sandbox.java @@ -69,7 +69,7 @@ public class Sandbox { * change per JVM session * @param readDataHolderClass class type of JNIReadDataHolderClass * @param haplotypeDataHolderClass class type of JNIHaplotypeDataHolderClass - * @param mask mask is a 64 bit integer identical to the one received from jniGetMachineType(). Users can disable usage of some hardware features by zeroing some bits in the mask + * @param mask 64 bit integer identical to the one received from jniGetMachineType(). Users can disable usage of some hardware features by zeroing bits in the mask * */ private native void jniInitializeClassFieldsAndMachineMask(Class readDataHolderClass, Class haplotypeDataHolderClass, long mask); @@ -156,7 +156,7 @@ public class Sandbox { public native void jniClose(); public void close() { - System.out.println("Time spent in setup for JNI call : "+(setupTime*1e-9)+" compute time : "+(computeTime*1e-9)); + System.err.println("Time spent in setup for JNI call : " + (setupTime * 1e-9) + " compute time : " + (computeTime * 1e-9)); jniClose(); } @@ -170,8 +170,8 @@ public class Sandbox { } catch(FileNotFoundException e) { - System.err.println("File "+filename+" cannot be found/read"); - return; + System.err.println("File "+filename + " cannot be found/read"); + return; } int idx = 0; int numReads = 0; diff --git a/public/VectorPairHMM/src/main/c++/utils.cc b/public/VectorPairHMM/src/main/c++/utils.cc index 3b0ce35ee..89bd975ae 100644 --- a/public/VectorPairHMM/src/main/c++/utils.cc +++ b/public/VectorPairHMM/src/main/c++/utils.cc @@ -154,20 +154,20 @@ void initialize_function_pointers(uint64_t mask) //mask = (1 << SSE41_CUSTOM_IDX); if(is_avx_supported() && (mask & (1<< AVX_CUSTOM_IDX))) { - cout << "Using AVX accelerated implementation of PairHMM\n"; + cerr << "Using AVX accelerated implementation of PairHMM\n"; g_compute_full_prob_float = compute_full_prob_avxs; g_compute_full_prob_double = compute_full_prob_avxd; } else if(is_sse41_supported() && (mask & ((1<< SSE41_CUSTOM_IDX) | (1<; g_compute_full_prob_double = compute_full_prob_ssed; } else { - cout << "Using un-vectorized C++ implementation of PairHMM\n"; + cerr << "Using un-vectorized C++ implementation of PairHMM\n"; g_compute_full_prob_float = compute_full_prob; g_compute_full_prob_double = compute_full_prob; } @@ -300,16 +300,16 @@ void tokenize(std::ifstream& fptr, std::vector& tokens) { myVec.push_back(tmp); ++i; - //std::cout < 0) break; } tokens.clear(); - //std::cout << "Why "<i = new char[tc->rslen]; tc->d = new char[tc->rslen]; tc->c = new char[tc->rslen]; - //cout << "Lengths "<haplen <<" "<rslen<<"\n"; + //cerr << "Lengths "<haplen <<" "<rslen<<"\n"; memcpy(tc->rs, tokens[1].c_str(),tokens[1].size()); assert(tokens.size() == (size_t)(2 + 4*(tc->rslen))); //assert(tc->rslen < MROWS); @@ -522,7 +522,7 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size, double rel_error = (baseline_result != 0) ? fabs(abs_error/baseline_result) : 0; if(abs_error > 1e-5 && rel_error > 1e-5) { - cout << std::scientific << baseline_result << " "<not] thread safe [because it is immutable]. - * - * @author I. M. Coder - * @version $Revision$ - */ -class CrispConcept -{ - public static final String ID = "$Id$"; -\end{lstlisting} - - -You are encouraged to provide as much explanatory material as you feel is helpful following that first, summary sentence. Information on algorithms and other information that will help a client make appropriate use of the class is particularly welcome. (But see Stateful Interfaces, and Rules for Use, below.) -Tell us whether your class is thread-safe or not. Thread safety due to immutability is particularly well worth mentioning. (It may warn a maintenance programmer off adding the set methods that you apparently forgot to provide.) -All methods must have the following, minimum level of javadoc: -\lstset{language=Java, caption=Good Method Javadoc, frame=leftline, label=MethodJavadoc,basicstyle=\small} -\lstset{tabsize=4} -\begin{lstlisting} -/** - * One crisp, informative sentence or noun phrase that explains - * what the method does. - * - * @param parm1 Parm1 selects the widget to be frobnicated. Cannot be null. - * @param parm2 Parm2 specifies the type of frobnication to apply. - * @return The frobnicated widget. - * @throws FrobnicationException Thrown if widget isnt frobnicable. - */ -\end{lstlisting} -For each parameter of reference type, tell us whether the reference may be null. -\section{Imports} -There is a balance between trying to maintain lengthy lists of classes imported one-by-one on the one hand, and importing many packages wholesale using an asterisk on the other. It shouldnt often be a big problem: there are exceptions, but a class that makes use of dozens of other classes may be trying to tell you that it needs some redesign. The suggestion is to import classes explicitly from packages we have writtenespecially those under active development. Also import explicitly when using just a few classes from a given package. Try to restrict whole-package imports to well tested, slowly changing third party packages. (Packages in the JDK, for example, are reasonable to import as a whole.) -\section{Order of Class Members} -Classes should be laid out consistently. You may put the member fields at either the bottom or the top, but you must not sprinkle them throughout. -There is an argument that a class ought to be ordered with its public constants and constructors at the very top, its public methods next, and its internal stuff last, since that concentrates at the very top what a programmer needs to know to make use of the class. This isnt obligatory, but you may wish to give it some consideration. -Nested classes go at the very bottom, after everything else. -\section{Make em Pretty} -Use four space tabs. Dont omit braces around single statements. Line things up so that its clear what things are on the same level. Give us enough white space to make it pretty. - - -\chapter{Design Considerations} -If youre doing things right (and the DoD doesnt) design isnt a distinct phase that ends when coding begins. Ideally, youll develop a comprehensive, top-down design before you begin coding. This may take more than one napkin. Even so, youll face many decisions about implementation details that are not completely specified by the overall design. In other words, its inevitable that youll be doing design while you code. What follows are some coding standards for you to consider when doing this implementation-phase design. -\section{Encapsulation} -With very rare exception, all fieldsboth instance members and static membersshould be private. Protected and default (i.e., package-scoped) access is like public access, only less so. (Making a change in such a field still requires you to locate and analyze use of the field in indefinitely many files: for public fields you need to look everywhere, for protected fields you need only scan everything that extends you, and for default-access fields, you need only scan everything in the same package. This is arduous for those poor souls who must try to maintain your code.) -Always use the most restrictive permission consistent with the design of your class. Dont make all of your internal methods protected in the vain hope that someday, some extending class might need to tweak your internal state, and youll make it easy. -Similarly, dont provide a getter and setter for every piece of your internal state: the goal is to meet the contract of the interfaces you implement, and to hide the details of how you do it. Even the most legitimately passive of objectsthe model objects youve just hauled out of the relational databasewill likely have private, internal state that should not be exposed directly to clients. (Check out all the hidden state in EJB entity beans, for example.) -\section{Is-A versus Has-A} -A subclass and its superclass have an Is-A (or specialization-generalization) relationship. A class and a component of that class have a Has-A (or containment) relationship. If you have a crisp, clear idea of what kind of things two classes represent, then simply saying to yourself Thing A is a (special kind of a) Thing B, and Thing A has a Thing B (as one of its parts), will often make it clear what the relationship should be: one of the two sentences may sound very odd. A Dog is an Animal. A Car has a Steering Wheel. So Dog extends (or implements) Animal. (And not vice versa.) A Car has a Steering Wheel as one of its members. Sometimes people try to save programming (or computer) time by adding properties to a Steering Wheel to try to turn it into a Car. This is a very bad idea: think of the trouble youll have changing the Cars steering wheel for a nice padded-leather model if the Car is the Steering Wheel. Think of the trouble that youll have comparing Steering Wheels if some of them are Cars. -\section{Redundant data} -If you have only one copy of a given datum, it will be either right or wrong, but it wont be inconsistent with other data. Guaranteeing consistency is a great deal more complex than guaranteeing accuracy. Checking for and maintaining consistency among multiple copies of a datum often robs you of the efficiency you hoped to gain by the denormalization; not checking for and not maintaining consistency is a very frequent source of hard-to-fix bugs, and weird, unreliable program behavior. -\section{Exceptions} -Exceptions handle, well, exceptional conditions. Properly used, they provide a last-gasp attempt to allow a robust program to clean-up and recover from catastrophic situations. Exceptions should not be thrown frequently, certainly not as a part of the normal, expected flow of a program. Do not use them as a nifty hack for implementing non-local transfer of control. (Exceptions are far more expensive than normal returns, so the performance wizards wont be tempted to do this, anyway. For those who care more about well-designed, maintainable code than about saving clicks, youll realize that code that relies on exceptions for normal flow is just too difficult to comprehend and debug.) -RuntimeExceptions are for even more rare, more catastrophic situations from which recovery is unlikely, at best. -In code which will be called by general clients outside your package, catch and re-throw exceptions from the lower layers of code on which you depend to give a more package-oriented explanation of the bad thing that happened. However, preserve information when you do this: Wrap the original exception in a new exception that supplements rather than replaces the original message. And respond to all flavors of printStackTrace with the nested exceptions stack trace (i.e., delegate these methods to the nested exception). -Never create exceptions with null messages. -\section{Recovering External Resources} -Java frees you from having to worry about memory as a resource. (Well, it reduces the worry, anyway.) Therefore you should have oodles of time left over to make certain that you free other resources when youre done with them. Two key external resources that you must make certain to release are open streams (which chew up a precious operating system file handle), and database connections (which chew up precious DBMS memory). The only really reliable way to make certain that these resources get freed is to create and use them within a single try block, and to release them in the finally clause. -Try to avoid designs that require a class to maintain an open stream. One technique is to use an event-driven model to turn the file processing upside down: you can still have a nicely modular and reusable class while processing the file within the scope of a single block by using Listeners. -\section{Stateful Interfaces and Rules for Use} -Good interfaces are concise, comprehensive, and orthogonal. Concise means that there are no superfluous operations that dont seem to fit the underlying abstraction, and that there is one good way of accomplishing a given end, not a variety of ways from which you must choose. Comprehensive means that everything you might need to do in manipulating the object has been provided for. And orthogonal means that each method does something independent, and that any method can be called at any time. This is very difficult to achieve, but is an ideal toward which we must strive. Its hard work. -Poor interfaces are cluttered with Rules for Use. If you are lucky, these rules are made explicit in documentation: Be sure to call this method before calling that one, but never call this method if youve ever called that one, and do, please, remember to call this one when youre all done. Needless to say, these interfaces are very difficult for clients to use correctly. -One particularly common form of this blight is the Stateful Interface: the object has a lifecycle, and certain methods are appropriate only when the object is in some particular phase of the lifecycle. For a simple example, read the javadoc for the java.sql.CallableStatement class. It describes how you must call registerOutputParameter before calling execute, how, for maximum portability, you should not call getMoreResults if you have called any of the getOutputParameter methods, and how you should remember to call close when youre all done. (And you thought we were exaggerating!) -If you cant seem to work out how to avoid Rules for Use on your interfaces, you must at the very least make sure that each method call detects misuse, and throws an appropriate exception or does something other than trash your internal state. Every public method, if it affects object state at all, must transform the object from one valid, consistent state to another valid, consistent state. -If you need some ego-incentive to motivate you, consider this: Hard-to-use, hard-to-understand, hard-to-maintain code is quickly replaced after you cease to maintain it. What kind of legacy is that? -Sometimes you can see several distinct patterns of use among the clients of an interface: they might be telling you that you need to re-factor the interface into two separate interfaces. -\section{Multithreading} -Making your implementations thread-safe is a enormously complex issue. Unfortunately, almost all of us are doing some work in multi-threaded environments (writing servlets, for example), and it's an issue that we are forced to confront. -If you havent examined the issue for one of the classes youve implemented (either because you dont anticipate its being used in a multi-threaded environment, or because the whole thing makes your brain ache), please provide a javadoc comment for the class indicating that it is not thread-safe. If youre not sure, its not safe! -Dont just synchronize every method. Synchronization is far too expensive to use carelessly. (Less so than it used to be, but still expensive. And it doesnt resolve all multi-threading issues, anyway.) -One way of beginning to address the issue of thread-safety is to understand what doesnt need any special thread-safety code, and try to produce as much of that as you can. Here are a couple of quick tips. -Objects that can be seen only by a single thread are immune from the issue: If the only references to an object are from local variablesthat is, if a reference to the object is never stored in an instance or static fieldit will be visible only from the thread that creates it. -Immutable objects are automatically thread-safe. If you cant change it, you cant see it in an inconsistent, intermediate state. -\section{Canonical methods} -Most simple classes either do or are. In the EJB environment session beans are the do classes, and entity beans are the are classes. In the model-view-controller paradigm, model objects are the are objects, controllers are the do objects, and views are mostly are, but typically also have a little bit of do flavor. So, you see, it does depend on what your definition of is is. -The point of the distinction is that most of the are classes, those often immutable little bags of independent state, usually need to override equals and hashCode to behave properly. You must implement these basic object operations in each of your passive, data-bearing classes; you may wish (or need) to implement them in the others. -The ares are typically more useful when Comparableimplementing that interface allows you to put them into sorted Collectionsso you ought to consider that next. -Being Cloneable and Serializable usually come for free (no code to write), so throw those into the mix, too, unless theres a compelling reason not to. An example of a reason not to might be that you need to maintain uniqueness at a level of abstraction higher than object identityyou dont want to allow clients to make copies. -\section{Performance} -Your overall designyour selection of algorithms and data structures, for examplehas a far greater impact on performance than any little hacks you can apply while implementing the code. So design for performance, and implement for clarity. -Nonetheless, the java compiler that most of us use can use a little help in doing optimization. Dont expect order-of-magnitude performance gainsyoull get those by designing away I/O, replacing searches with hashes, etc.these are percentage point tweaks. -Move invariant code out of loops. For example, many for loops can calculate their terminating condition once, before the loop starts, rather than at each iteration through the loop. -\lstset{language=Java, caption=Improvements to loops, frame=leftline, label=loopInprovements,basicstyle=\small} -\lstset{tabsize=4} -\begin{lstlisting} -for ( int iii = 0; iii < str.length(); ++iii ) // bad - -int nnn = str.length(); -for ( int iii = 0; iii < nnn; ++iii ) // good -\end{lstlisting} -Strength reduction: do a simple calculation to update the value of some variable using the value it has from a previous trip through a loop, rather than from scratch each time. -\lstset{language=Java, caption=Good looping practices, frame=leftline, label=loopPractices,basicstyle=\small} -\lstset{tabsize=4} -\begin{lstlisting} -for ( int iii = 0; iii < nnn; ++iii ) // bad -{ - double val = pow( 2., iii ); - . . . -} - -double val = 1.; -for ( int iii = 0; iii < nnn; ++iii ) // good -{ - . . . - val *= 2.; -} -\end{lstlisting} -Avoid some performance dogs in the SDK: Use the underlying Stream classes rather than Readers when appropriate. Use the newer, non-synchronized collection classes rather than Vector and Hashtable. -Penalties -Code that fails to follow these guidelines will be posted around the MIT campus, along with the authors email address and an urgent request for comments. - -\end{document} \ No newline at end of file diff --git a/public/doc/README b/public/doc/README index e70ced0df..a6b5ca008 100644 --- a/public/doc/README +++ b/public/doc/README @@ -1,86 +1,3 @@ -The Genome Analysis Toolkit (GATK) -Copyright (c) 2009 The Broad Institute - -Overview --------- -The Genome Analysis Toolkit (GATK) is a structured programming -framework designed to enable rapid development of efficient and robust -analysis tools for next-generation DNA sequencers. The GATK solves -the data management challenge by separating data access patterns from -analysis algorithms, using the functional programming philosophy of -Map/Reduce. Consequently, the GATK is structured into data traversals -and data walkers that interact through a programming contract in which -the traversal provides a series of units of data to the walker, and -the walker consumes each datum to generate an output for each datum. -Because many tools to analyze next-generation sequencing data access -the data in a very similar way, the GATK can provide a small but -nearly comprehensive set of traversal types that satisfying the data -access needs of the majority of analysis tools. For example, -traversals "by each sequencer read" and "by every read covering -each locus in a genome" are common throughout many tools such as -counting reads, building base quality histograms, reporting average -coverage of the genome, and calling SNPs. The small number of these -traversals, shared among many tools enables the core GATK development -team to optimize such traversals for correctness, stability, CPU -performance, memory footprint, and in many cases to even automatically -parallelize calculations. Moreover, since the traversal engine -encapsulates the complexity of efficiently accessing the -next-generation sequencing data, researchers and developers are free -to focus on their specific analysis algorithms. This not only vastly -improves productivity of the developers, who can quickly write new -analyses, but also results in tools that are efficient and robust and -can benefit from improvement to a common data management engine. - -Capabilities ------------- -The GenomeAnalysisTK development environment is currently provided as -a platform-independent Java programming language library. The core -system works with the nascent standard Sequence Alignment/Map (SAM) -format to represent reads using a production-quality SAM library -developed at the Broad. The system can access a variety of metadata -files such as dbSNP, Hapmap, RefSeq as well as work with genotype and -SNP files in GLF, Geli, and other common formats. The core system -handles read data from Illumina/Solexa, SOLiD, and Roche/454. The -current GATK engine can process all of the 1000 genomes data -representing ~5Tb of data from these three technologies produced from -multiple sequencing centers and aligned to the human reference genome -with multiple aligners. The GATK currently provides traversals by -each read (ByRead traversal), by all reads covering each locus in the -genome (ByLoci traversal), and by all reads within pre-specified -intervals on the genome (ByWindow traversal). - -Dependencies ------------- -The GATK relies on a Java 6-compatible JRE. At the time of this writing, -the GATK team tests with Sun JRE version 1.6.0_12-b04. Additionally, the -GATK requires as inputs a sorted, indexed BAM file containing aligned reads -and a fasta-format reference with associated dictionary file (.dict)and -index (.fasta.fai). - -Instructions for preparing input files are available here: - -http://www.broadinstitute.org/gatk/guide/article?id=1204 - -The bundled 'resources' directory contains an example BAM and fasta. - -Getting Started ---------------- -The GATK is distributed with a few standard analyses, including PrintReads, -Pileup, and DepthOfCoverage. More information on the included walkers is -available here: - -http://www.broadinstitute.org/gatk/gatkdocs - -To print the reads of the included sample data, untar the package into -the GenomeAnalysisTK directory and run the following command: - -java -jar GenomeAnalysisTK/GenomeAnalysisTK.jar \ - -T PrintReads \ - -R GenomeAnalysisTK/resources/exampleFASTA.fasta \ - -I GenomeAnalysisTK/resources/exampleBAM.bam - -Support -------- -Documentation for the GATK is available at http://www.broadinstitute.org/gatk/guide. -For help using the GATK, developing analyses with the GATK, bug reports, -or feature requests, please visit our support forum at http://gatkforums.broadinstitute.org/ +The Genome Analysis Toolkit +============ +See http://www.broadinstitute.org/gatk/ \ No newline at end of file diff --git a/public/external-example/pom.xml b/public/external-example/pom.xml index 506580502..46bdab5d2 100644 --- a/public/external-example/pom.xml +++ b/public/external-example/pom.xml @@ -9,7 +9,7 @@ External Example - 3.3 + 3.4-SNAPSHOT - process-resources + prepare-package package @@ -49,7 +49,15 @@ org.broadinstitute.gatk - gatk-tools-public + gatk-utils + ${gatk.version} + test-jar + test + + + + org.broadinstitute.gatk + gatk-engine ${gatk.version} test-jar test @@ -82,7 +90,7 @@ org.broadinstitute.gatk - gatk-engine + gatk-utils ${gatk.version} example-resources tar.bz2 @@ -111,10 +119,9 @@ ${project.build.outputDirectory} - org.broadinstitute.gatk - - gatk-tools-public - ${gatk.version} + ${project.groupId} + ${project.artifactId} + ${project.version} 2g false @@ -138,6 +145,7 @@ true + false @@ -147,7 +155,7 @@ - samtools:htsjdk + com.github.samtools:htsjdk ** @@ -252,7 +260,32 @@ + + true + + + + + + + fast + + + disable.shadepackage + + + + none + none + + + packagetests-enabled diff --git a/public/external-example/src/main/java/org/mycompany/app/MyExampleWalker.java b/public/external-example/src/main/java/org/mycompany/app/MyExampleWalker.java index 1834c4a4a..8dedbdd59 100644 --- a/public/external-example/src/main/java/org/mycompany/app/MyExampleWalker.java +++ b/public/external-example/src/main/java/org/mycompany/app/MyExampleWalker.java @@ -26,9 +26,9 @@ package org.mycompany.app; import org.broadinstitute.gatk.utils.commandline.Output; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.LocusWalker; import java.io.PrintStream; diff --git a/public/external-example/src/test/java/org/mycompany/app/MyExampleWalkerIntegrationTest.java b/public/external-example/src/test/java/org/mycompany/app/MyExampleWalkerIntegrationTest.java index ee625a46d..c3461f23e 100644 --- a/public/external-example/src/test/java/org/mycompany/app/MyExampleWalkerIntegrationTest.java +++ b/public/external-example/src/test/java/org/mycompany/app/MyExampleWalkerIntegrationTest.java @@ -46,9 +46,13 @@ public class MyExampleWalkerIntegrationTest extends WalkerTest { } private File getResource(String path) throws URISyntaxException { + return new File(publicTestDir, path); + /* + TODO: Enable proper resource extraction from the test jars. For now just use the publicTestDir path. URL resourceUrl = getClass().getResource(path); if (resourceUrl == null) throw new MissingResourceException("Resource not found: " + path, getClass().getSimpleName(), path); return new File(resourceUrl.toURI()); + */ } } diff --git a/public/gatk-engine/pom.xml b/public/gatk-engine/pom.xml index 15ba06ecb..1f59cd1bc 100644 --- a/public/gatk-engine/pom.xml +++ b/public/gatk-engine/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.3 + 3.4-SNAPSHOT ../.. @@ -24,6 +24,22 @@ gatk-utils ${project.version} + + net.java.dev.jets3t + jets3t + + + org.simpleframework + simple-xml + + + + ${project.groupId} + gatk-utils + ${project.version} + test-jar + test + com.google.caliper @@ -34,28 +50,6 @@ - - org.apache.maven.plugins - maven-assembly-plugin - - - example-resources - ${gatk.generate-resources.phase} - - - - - org.apache.maven.plugins - maven-resources-plugin - - - copy-resource-bundle-log4j - prepare-package - - - - org.apache.maven.plugins maven-invoker-plugin diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java new file mode 100644 index 000000000..a2bb4afd9 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java @@ -0,0 +1,229 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor; +import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.io.stubs.OutputStreamArgumentTypeDescriptor; +import org.broadinstitute.gatk.engine.io.stubs.SAMFileWriterArgumentTypeDescriptor; +import org.broadinstitute.gatk.engine.io.stubs.VCFWriterArgumentTypeDescriptor; +import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.engine.crypt.CryptUtils; +import org.broadinstitute.gatk.engine.crypt.GATKKey; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.text.ListFileUtils; + +import java.security.PublicKey; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; + +/** + * @author aaron + */ +public abstract class CommandLineExecutable extends CommandLineProgram { + /** + * The actual engine which performs the analysis. + */ + protected GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + + // get the analysis name + public abstract String getAnalysisName(); + + /** + * Gets the GATK argument bundle. + * @return A structure consisting of whatever arguments should be used to initialize the GATK engine. + */ + protected abstract GATKArgumentCollection getArgumentCollection(); + + /** + * A list of all the arguments initially used as sources. + */ + private final Collection argumentSources = new ArrayList(); + + protected static Logger logger = Logger.getLogger(CommandLineExecutable.class); + + /** + * this is the function that the inheriting class can expect to have called + * when the command line system has initialized. + * + * @return the return code to exit the program with + */ + protected int execute() throws Exception { + engine.setParser(parser); + argumentSources.add(this); + + Walker walker = engine.getWalkerByName(getAnalysisName()); + + try { + // Make sure a valid GATK user key is present, if required. + authorizeGATKRun(); + + engine.setArguments(getArgumentCollection()); + + // File lists can require a bit of additional expansion. Set these explicitly by the engine. + final Collection bamFileList=ListFileUtils.unpackBAMFileList(getArgumentCollection().samFiles,parser); + engine.setSAMFileIDs(bamFileList); + if(getArgumentCollection().showFullBamList){ + logger.info(String.format("Adding the following input SAM Files: %s",bamFileList.toString())); + } + + engine.setWalker(walker); + walker.setToolkit(engine); + + Collection filters = engine.createFilters(); + engine.setFilters(filters); + + // load the arguments into the walker / filters. + // TODO: The fact that this extra load call exists here when all the parsing happens at the engine + // TODO: level indicates that we're doing something wrong. Turn this around so that the GATK can drive + // TODO: argument processing. + loadArgumentsIntoObject(walker); + argumentSources.add(walker); + + Collection rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser); + engine.setReferenceMetaDataFiles(rodBindings); + + for (ReadFilter filter: filters) { + loadArgumentsIntoObject(filter); + argumentSources.add(filter); + } + + engine.execute(); + generateGATKRunReport(walker); + } catch ( Exception e ) { + generateGATKRunReport(walker, e); + throw e; + } + + // always return 0 + return 0; + } + + /** + * Authorizes this run of the GATK by checking for a valid GATK user key, if required. + * Currently, a key is required only if running with the -et NO_ET or -et STDOUT options. + */ + private void authorizeGATKRun() { + if ( getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.NO_ET || + getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.STDOUT ) { + if ( getArgumentCollection().gatkKeyFile == null ) { + throw new UserException("Running with the -et NO_ET or -et STDOUT option requires a GATK Key file. " + + "Please see " + UserException.PHONE_HOME_DOCS_URL + + " for more information and instructions on how to obtain a key."); + } + else { + PublicKey gatkPublicKey = CryptUtils.loadGATKDistributedPublicKey(); + GATKKey gatkUserKey = new GATKKey(gatkPublicKey, getArgumentCollection().gatkKeyFile); + + if ( ! gatkUserKey.isValid() ) { + throw new UserException.KeySignatureVerificationException(getArgumentCollection().gatkKeyFile); + } + } + } + } + + /** + * Generate the GATK run report for this walker using the current GATKEngine, if -et is enabled. + * This report will be written to either STDOUT or to the run repository, depending on the options + * for -et. + * + * @param e the exception, can be null if no exception occurred + */ + private void generateGATKRunReport(Walker walker, Exception e) { + if ( getArgumentCollection().phoneHomeType != GATKRunReport.PhoneHomeOption.NO_ET ) { + GATKRunReport report = new GATKRunReport(walker, e, engine, getArgumentCollection().phoneHomeType ); + report.postReport(getArgumentCollection().phoneHomeType); + } + } + + /** + * Convenience method for fully parameterized generateGATKRunReport when an exception has + * not occurred + * + * @param walker + */ + private void generateGATKRunReport(Walker walker) { + generateGATKRunReport(walker, null); + } + + /** + * Subclasses of CommandLinePrograms can provide their own types of command-line arguments. + * @return A collection of type descriptors generating implementation-dependent placeholders. + */ + protected Collection getArgumentTypeDescriptors() { + return Arrays.asList( new VCFWriterArgumentTypeDescriptor(engine,System.out,argumentSources), + new SAMFileWriterArgumentTypeDescriptor(engine,System.out), + new OutputStreamArgumentTypeDescriptor(engine,System.out) ); + } + + /** + * GATK can add arguments dynamically based on analysis type. + * + * @return true + */ + @Override + protected boolean canAddArgumentsDynamically() { + return true; + } + + /** + * GATK provides the walker as an argument source. + * @return List of walkers to load dynamically. + */ + @Override + protected Class[] getArgumentSources() { + // No walker info? No plugins. + if (getAnalysisName() == null) return new Class[] {}; + + Collection argumentSources = new ArrayList(); + + Walker walker = engine.getWalkerByName(getAnalysisName()); + engine.setArguments(getArgumentCollection()); + engine.setWalker(walker); + walker.setToolkit(engine); + argumentSources.add(walker.getClass()); + + Collection filters = engine.createFilters(); + for(ReadFilter filter: filters) + argumentSources.add(filter.getClass()); + + Class[] argumentSourcesAsArray = new Class[argumentSources.size()]; + return argumentSources.toArray(argumentSourcesAsArray); + } + + @Override + protected String getArgumentSourceName( Class argumentSource ) { + return engine.getWalkerName((Class)argumentSource); + } + +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java new file mode 100644 index 000000000..328960390 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java @@ -0,0 +1,370 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import htsjdk.samtools.SAMException; +import htsjdk.tribble.TribbleException; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; +import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; +import org.broadinstitute.gatk.utils.refdata.tracks.FeatureManager; +import org.broadinstitute.gatk.engine.walkers.Attribution; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.help.*; +import org.broadinstitute.gatk.utils.text.TextFormattingUtils; + +import java.util.*; + +/** + * All command line parameters accepted by all tools in the GATK. + * + *

    Info for end users

    + * + *

    This is a list of options and parameters that are generally available to all tools in the GATK.

    + * + *

    There may be a few restrictions, which are indicated in individual argument descriptions. For example the -BQSR + * argument is only meant to be used with a subset of tools, and the -pedigree argument will only be effectively used + * by a subset of tools as well. Some arguments conflict with others, and some conversely are dependent on others. This + * is all indicated in the detailed argument descriptions, so be sure to read those in their entirety rather than just + * skimming the one-line summary in the table.

    + * + *

    Info for developers

    + * + *

    This class is the GATK engine itself, which manages map/reduce data access and runs walkers.

    + * + *

    We run command line GATK programs using this class. It gets the command line args, parses them, and hands the + * gatk all the parsed out information. Pretty much anything dealing with the underlying system should go here; + * the GATK engine should deal with any data related information.

    + */ +@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_ENGINE) +public class CommandLineGATK extends CommandLineExecutable { + /** + * A complete list of tools (sometimes also called walkers because they "walk" through the data to perform analyses) + * is available in the online documentation. + */ + @Argument(fullName = "analysis_type", shortName = "T", doc = "Name of the tool to run") + private String analysisName = null; + + // our argument collection, the collection of command line args we accept + @ArgumentCollection + private GATKArgumentCollection argCollection = new GATKArgumentCollection(); + + /** + * Get pleasing info about the GATK. + * + * @return A list of Strings that contain pleasant info about the GATK. + */ + @Override + protected ApplicationDetails getApplicationDetails() { + return new ApplicationDetails(createApplicationHeader(), + getAttribution(), + ApplicationDetails.createDefaultRunningInstructions(getClass()), + getAdditionalHelp()); + } + + @Override + public String getAnalysisName() { + return analysisName; + } + + @Override + protected GATKArgumentCollection getArgumentCollection() { + return argCollection; + } + + /** + * Required main method implementation. + */ + public static void main(String[] argv) { + try { + CommandLineGATK instance = new CommandLineGATK(); + start(instance, argv); + System.exit(CommandLineProgram.result); // todo -- this is a painful hack + } catch (UserException e) { + exitSystemWithUserError(e); + } catch (TribbleException e) { + // We can generate Tribble Exceptions in weird places when e.g. VCF genotype fields are + // lazy loaded, so they aren't caught elsewhere and made into User Exceptions + exitSystemWithUserError(e); + } catch (SAMException e) { + checkForMaskedUserErrors(e); + exitSystemWithSamError(e); + } catch (OutOfMemoryError e) { + exitSystemWithUserError(new UserException.NotEnoughMemory()); + } catch (Throwable t) { + checkForMaskedUserErrors(t); + exitSystemWithError(t); + } + } + + public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file"; + public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files"; + public static final String NO_SPACE_LEFT_ON_DEVICE_ERROR = "No space left on device"; + public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded"; + + private static void checkForMaskedUserErrors(final Throwable t) { + // masked out of memory error + if ( t instanceof OutOfMemoryError ) + exitSystemWithUserError(new UserException.NotEnoughMemory()); + // masked user error + if ( t instanceof UserException || t instanceof TribbleException ) + exitSystemWithUserError(new UserException(t.getMessage())); + + // no message means no masked error + final String message = t.getMessage(); + if ( message == null ) + return; + + // too many open files error + if ( message.contains("Too many open files") ) + exitSystemWithUserError(new UserException.TooManyOpenFiles()); + + // malformed BAM looks like a SAM file + if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) + exitSystemWithSamError(t); + + // can't close tribble index when writing + if ( message.contains("Unable to close index for") ) + exitSystemWithUserError(new UserException(t.getCause() == null ? message : t.getCause().getMessage())); + + // disk is full + if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) ) + exitSystemWithUserError(new UserException.NoSpaceOnDevice()); + + // masked error wrapped in another one + if ( t.getCause() != null ) + checkForMaskedUserErrors(t.getCause()); + } + + /** + * Creates the a short blurb about the GATK, copyright info, and where to get documentation. + * + * @return The application header. + */ + public static List createApplicationHeader() { + List header = new ArrayList(); + header.add(String.format("The Genome Analysis Toolkit (GATK) v%s, Compiled %s",getVersionNumber(), getBuildTime())); + header.add("Copyright (c) 2010 The Broad Institute"); + header.add("For support and documentation go to " + HelpConstants.BASE_GATK_URL); + return header; + } + + /** + * If the user supplied any additional attribution, return it here. + * @return Additional attribution if supplied by the user. Empty (non-null) list otherwise. + */ + private List getAttribution() { + List attributionLines = new ArrayList(); + + // If no analysis name is present, fill in extra help on the walkers. + WalkerManager walkerManager = engine.getWalkerManager(); + String analysisName = getAnalysisName(); + if(analysisName != null && walkerManager.exists(analysisName)) { + Class walkerType = walkerManager.getWalkerClassByName(analysisName); + if(walkerType.isAnnotationPresent(Attribution.class)) + attributionLines.addAll(Arrays.asList(walkerType.getAnnotation(Attribution.class).value())); + } + return attributionLines; + } + + /** + * Retrieves additional information about GATK walkers. + * the code in HelpFormatter and supply it as a helper to this method. + * + * @return A string summarizing the walkers available in this distribution. + */ + private String getAdditionalHelp() { + String additionalHelp; + + // If no analysis name is present, fill in extra help on the walkers. + WalkerManager walkerManager = engine.getWalkerManager(); + if(analysisName != null && walkerManager.exists(analysisName)) + additionalHelp = getWalkerHelp(walkerManager.getWalkerClassByName(analysisName)); + else + additionalHelp = getAllWalkerHelp(); + + return additionalHelp; + } + + private static final int PACKAGE_INDENT = 1; + private static final int WALKER_INDENT = 3; + private static final String FIELD_SEPARATOR = " "; + + private String getWalkerHelp(Class walkerType) { + // Construct a help string to output details on this walker. + StringBuilder additionalHelp = new StringBuilder(); + Formatter formatter = new Formatter(additionalHelp); + + formatter.format("Available Reference Ordered Data types:%n"); + formatter.format(new FeatureManager().userFriendlyListOfAvailableFeatures()); + formatter.format("%n"); + + formatter.format("For a full description of this walker, see its GATKdocs at:%n"); + formatter.format("%s%n", GATKDocUtils.helpLinksToGATKDocs(walkerType)); + + return additionalHelp.toString(); + } + + /** + * Load in additional help information about all available walkers. + * @return A string representation of the additional help. + */ + private String getAllWalkerHelp() { + // Construct a help string to output available walkers. + StringBuilder additionalHelp = new StringBuilder(); + Formatter formatter = new Formatter(additionalHelp); + + // Get the list of walker names from the walker manager. + WalkerManager walkerManager = engine.getWalkerManager(); + + // Build a list sorted by walker display name. As this information is collected, keep track of the longest + // package / walker name for later formatting. + SortedSet helpText = new TreeSet(new HelpEntryComparator()); + + int longestPackageName = 0; + int longestWalkerName = 0; + for(Map.Entry>> walkersByPackage: walkerManager.getWalkerNamesByPackage(true).entrySet()) { + // Get the display name. + String packageName = walkersByPackage.getKey(); + String packageDisplayName = walkerManager.getPackageDisplayName(walkersByPackage.getKey()); + String packageHelpText = walkerManager.getPackageSummaryText(packageName); + + // Compute statistics about which names is longest. + longestPackageName = Math.max(longestPackageName,packageDisplayName.length()); + + SortedSet walkersInPackage = new TreeSet(new HelpEntryComparator()); + for(Class walkerType: walkersByPackage.getValue()) { + String walkerName = walkerType.getName(); + String walkerDisplayName = walkerManager.getName(walkerType); + String walkerHelpText = walkerManager.getWalkerSummaryText(walkerType); + + longestWalkerName = Math.max(longestWalkerName,walkerManager.getName(walkerType).length()); + + walkersInPackage.add(new HelpEntry(walkerName,walkerDisplayName,walkerHelpText)); + } + + // Dump the walkers into the sorted set. + helpText.add(new HelpEntry(packageName,packageDisplayName,packageHelpText,Collections.unmodifiableSortedSet(walkersInPackage))); + } + + final int headerWidth = Math.max(longestPackageName+PACKAGE_INDENT,longestWalkerName+WALKER_INDENT); + + + for(HelpEntry packageHelp: helpText) { + printDescriptorLine(formatter,PACKAGE_INDENT,packageHelp.displayName,headerWidth,FIELD_SEPARATOR,packageHelp.summary,TextFormattingUtils.DEFAULT_LINE_WIDTH); + + for(HelpEntry walkerHelp: packageHelp.children) + printDescriptorLine(formatter,WALKER_INDENT,walkerHelp.displayName,headerWidth,FIELD_SEPARATOR,walkerHelp.summary,TextFormattingUtils.DEFAULT_LINE_WIDTH); + + // Print a blank line between sets of walkers. + printDescriptorLine(formatter,0,"",headerWidth,FIELD_SEPARATOR,"", TextFormattingUtils.DEFAULT_LINE_WIDTH); + } + + return additionalHelp.toString(); + } + + private void printDescriptorLine(Formatter formatter, + int headerIndentWidth, + String header, + int headerWidth, + String fieldSeparator, + String description, + int lineWidth) { + final int headerPaddingWidth = headerWidth - header.length() - headerIndentWidth; + final int descriptionWidth = lineWidth - fieldSeparator.length() - headerWidth; + List wordWrappedText = TextFormattingUtils.wordWrap(description,descriptionWidth); + + String headerIndentFormatString = headerIndentWidth > 0 ? "%" + headerIndentWidth + "s" : "%s"; + String headerPaddingFormatString = headerPaddingWidth > 0 ? "%" + headerPaddingWidth + "s" : "%s"; + String headerWidthFormatString = headerWidth > 0 ? "%" + headerWidth + "s" : "%s"; + + // Output description line. + formatter.format(headerIndentFormatString + "%s" + headerPaddingFormatString + "%s%s%n", + "", header, "", fieldSeparator, wordWrappedText.size()>0?wordWrappedText.get(0):""); + for(int i = 1; i < wordWrappedText.size(); i++) + formatter.format(headerWidthFormatString + "%s%s%n", "", fieldSeparator, wordWrappedText.get(i)); + } + +} + +/** + * Represents a given help entry; contains a display name, a summary and optionally some children. + */ +class HelpEntry { + public final String uid; + public final String displayName; + public final String summary; + public final SortedSet children; + + /** + * Create a new help entry with the given display name, summary and children. + * @param uid a unique identifier. Usually, the java package. + * @param displayName display name for this help entry. + * @param summary summary for this help entry. + * @param children children for this help entry. + */ + public HelpEntry(String uid, String displayName, String summary, SortedSet children) { + this.uid = uid; + this.displayName = displayName; + this.summary = summary; + this.children = children; + } + + /** + * Create a new help entry with the given display name, summary and children. + * @param uid a unique identifier. Usually, the java package. + * @param displayName display name for this help entry. + * @param summary summary for this help entry. + */ + public HelpEntry(String uid, String displayName, String summary) { + this(uid,displayName,summary,null); + } + +} + +/** + * Compare two help entries by display name. + */ +class HelpEntryComparator implements Comparator { + private static TextFormattingUtils.CaseInsensitiveComparator textComparator = new TextFormattingUtils.CaseInsensitiveComparator(); + + /** + * Compares the order of lhs to rhs, not taking case into account. + * @param lhs First object to compare. + * @param rhs Second object to compare. + * @return 0 if objects are identical; -1 if lhs is before rhs, 1 if rhs is before lhs. Nulls are treated as after everything else. + */ + public int compare(HelpEntry lhs, HelpEntry rhs) { + if(lhs == null && rhs == null) return 0; + if(lhs == null || lhs.displayName.equals("")) return 1; + if(rhs == null || rhs.displayName.equals("")) return -1; + return lhs.displayName.equals(rhs.displayName) ? textComparator.compare(lhs.uid,rhs.uid) : textComparator.compare(lhs.displayName,rhs.displayName); + } + + +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GATKVCFUtils.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GATKVCFUtils.java new file mode 100644 index 000000000..da3763f1a --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GATKVCFUtils.java @@ -0,0 +1,421 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.tribble.index.*; +import htsjdk.tribble.index.interval.IntervalTreeIndex; +import htsjdk.tribble.index.linear.LinearIndex; +import org.apache.log4j.Logger; +import htsjdk.tribble.Feature; +import htsjdk.tribble.index.interval.IntervalIndexCreator; +import htsjdk.tribble.index.linear.LinearIndexCreator; +import htsjdk.tribble.index.tabix.TabixFormat; +import htsjdk.tribble.index.tabix.TabixIndexCreator; +import htsjdk.tribble.readers.LineIterator; +import htsjdk.tribble.readers.PositionalBufferedStream; +import org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor; +import org.broadinstitute.gatk.utils.commandline.RodBinding; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.utils.collections.Pair; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.*; +import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType; + +import java.io.*; +import java.lang.reflect.Field; +import java.util.*; + + +/** + * A set of GATK-specific static utility methods for common operations on VCF files/records. + */ +public class GATKVCFUtils { + + /** + * Constructor access disallowed...static utility methods only! + */ + private GATKVCFUtils() { } + + public static final Logger logger = Logger.getLogger(GATKVCFUtils.class); + public final static String GATK_COMMAND_LINE_KEY = "GATKCommandLine"; + + public final static GATKVCFIndexType DEFAULT_INDEX_TYPE = GATKVCFIndexType.DYNAMIC_SEEK; // by default, optimize for seek time. All indices prior to Nov 2013 used this type. + public final static Integer DEFAULT_INDEX_PARAMETER = -1; // the default DYNAMIC_SEEK does not use a parameter + // as determined experimentally Nov-Dec 2013 + public final static GATKVCFIndexType DEFAULT_GVCF_INDEX_TYPE = GATKVCFIndexType.LINEAR; + public final static Integer DEFAULT_GVCF_INDEX_PARAMETER = 128000; + + // GVCF file extension + public final static String GVCF_EXT = "g.vcf"; + + // Message for using the deprecated --variant_index_type or --variant_index_parameter arguments. + public final static String DEPRECATED_INDEX_ARGS_MSG = "Naming your output file using the .g.vcf extension will automatically set the appropriate values " + + " for --variant_index_type and --variant_index_parameter"; + + /** + * Gets the appropriately formatted header for a VCF file describing this GATK run + * + * @param header the existing VCFHeader that we will be adding this command line argument header line to. Existing + * command line argument header lines will be used to generate a unique header line key. + * @param engine the GATK engine that holds the walker name, GATK version, and other information + * @param argumentSources contains information on the argument values provided to the GATK for converting to a + * command line string. Should be provided from the data in the parsing engine. Can be + * empty in which case the command line will be the empty string. + * @return VCF header line describing this run of the GATK. + */ + public static VCFHeaderLine getCommandLineArgumentHeaderLine(final VCFHeader header, final GenomeAnalysisEngine engine, final Collection argumentSources) { + if ( engine == null ) throw new IllegalArgumentException("engine cannot be null"); + if ( argumentSources == null ) throw new IllegalArgumentException("argumentSources cannot be null"); + + final Map attributes = new LinkedHashMap<>(); + attributes.put("ID", engine.getWalkerName()); + attributes.put("Version", CommandLineGATK.getVersionNumber()); + final Date date = new Date(); + attributes.put("Date", date.toString()); + attributes.put("Epoch", Long.toString(date.getTime())); + attributes.put("CommandLineOptions", engine.createApproximateCommandLineArgumentString(argumentSources.toArray())); + + // in case the walker name contains space, remove any spaces + String key = getCommandLineKey(header, engine.getWalkerName().replaceAll("\\s", "")); + return new VCFSimpleHeaderLine(key, attributes); + } + + // create a unique command line argument header line key. This method will look for existing + // keys using the same walker name and append a count after it to make it unique. + private static String getCommandLineKey(final VCFHeader header, final String walkerName) { + final Iterator existingMetaDataIterator = header.getMetaDataInInputOrder().iterator(); + + // the command line argument keys are in the format GATK_COMMAND_LINE_KEY.(walker name) + final String searchKey = String.format("%s.%s", GATK_COMMAND_LINE_KEY, walkerName); + + int commandLineKeyCount = 0; + VCFHeaderLine line; + while ( existingMetaDataIterator.hasNext() ) { + line = existingMetaDataIterator.next(); + // if we find another key that starts with the same text as the walker + if ( line.getKey().startsWith(searchKey) ) + commandLineKeyCount++; + } + + // if there are no existing keys with this same walker name, then just return the + // GATK_COMMAND_LINE_KEY.(walker name) format + if ( commandLineKeyCount == 0 ) + return searchKey; + // otherwise append the count associated with this new command (existing + 1) + else + return String.format("%s.%d", searchKey, commandLineKeyCount+1); + } + + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, List> rodBindings) { + // Collect the eval rod names + final Set names = new TreeSet(); + for ( final RodBinding evalRod : rodBindings ) + names.add(evalRod.getName()); + return getVCFHeadersFromRods(toolkit, names); + } + + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit) { + return getVCFHeadersFromRods(toolkit, (Collection)null); + } + + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, Collection rodNames) { + Map data = new HashMap(); + + // iterate to get all of the sample names + List dataSources = toolkit.getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + // ignore the rod if it's not in our list + if ( rodNames != null && !rodNames.contains(source.getName()) ) + continue; + + if ( source.getHeader() != null && source.getHeader() instanceof VCFHeader ) + data.put(source.getName(), (VCFHeader)source.getHeader()); + } + + return data; + } + + public static Map getVCFHeadersFromRodPrefix(GenomeAnalysisEngine toolkit,String prefix) { + Map data = new HashMap(); + + // iterate to get all of the sample names + List dataSources = toolkit.getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + // ignore the rod if lacks the prefix + if ( ! source.getName().startsWith(prefix) ) + continue; + + if ( source.getHeader() != null && source.getHeader() instanceof VCFHeader ) + data.put(source.getName(), (VCFHeader)source.getHeader()); + } + + return data; + } + + /** + * Gets the header fields from all VCF rods input by the user + * + * @param toolkit GATK engine + * + * @return a set of all fields + */ + public static Set getHeaderFields(GenomeAnalysisEngine toolkit) { + return getHeaderFields(toolkit, null); + } + + /** + * Gets the header fields from all VCF rods input by the user + * + * @param toolkit GATK engine + * @param rodNames names of rods to use, or null if we should use all possible ones + * + * @return a set of all fields + */ + public static Set getHeaderFields(GenomeAnalysisEngine toolkit, Collection rodNames) { + + // keep a map of sample name to occurrences encountered + TreeSet fields = new TreeSet(); + + // iterate to get all of the sample names + List dataSources = toolkit.getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + // ignore the rod if it's not in our list + if ( rodNames != null && !rodNames.contains(source.getName()) ) + continue; + + if ( source.getRecordType().equals(VariantContext.class)) { + VCFHeader header = (VCFHeader)source.getHeader(); + if ( header != null ) + fields.addAll(header.getMetaDataInSortedOrder()); + } + } + + return fields; + } + + /** + * Add / replace the contig header lines in the VCFHeader with the information in the GATK engine + * + * @param header the header to update + * @param engine the GATK engine containing command line arguments and the master sequence dictionary + */ + public static VCFHeader withUpdatedContigs(final VCFHeader header, final GenomeAnalysisEngine engine) { + return VCFUtils.withUpdatedContigs(header, engine.getArguments().referenceFile, engine.getMasterSequenceDictionary()); + } + + /** + * Create and return an IndexCreator + * @param type + * @param parameter + * @param outFile + * @return + */ + public static IndexCreator getIndexCreator(GATKVCFIndexType type, int parameter, File outFile) { + return getIndexCreator(type, parameter, outFile, null); + } + + /** + * Create and return an IndexCreator + * @param type + * @param parameter + * @param outFile + * @param sequenceDictionary + * @return + */ + public static IndexCreator getIndexCreator(GATKVCFIndexType type, int parameter, File outFile, SAMSequenceDictionary sequenceDictionary) { + if (ArgumentTypeDescriptor.isCompressed(outFile.toString())) { + if (type != GATKVCFUtils.DEFAULT_INDEX_TYPE || parameter != GATKVCFUtils.DEFAULT_INDEX_PARAMETER) + logger.warn("Creating Tabix index for " + outFile + ", ignoring user-specified index type and parameter"); + + if (sequenceDictionary == null) + return new TabixIndexCreator(TabixFormat.VCF); + else + return new TabixIndexCreator(sequenceDictionary, TabixFormat.VCF); + } + + IndexCreator idxCreator; + switch (type) { + case DYNAMIC_SEEK: idxCreator = new DynamicIndexCreator(outFile, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); break; + case DYNAMIC_SIZE: idxCreator = new DynamicIndexCreator(outFile, IndexFactory.IndexBalanceApproach.FOR_SIZE); break; + case LINEAR: idxCreator = new LinearIndexCreator(outFile, parameter); break; + case INTERVAL: idxCreator = new IntervalIndexCreator(outFile, parameter); break; + default: throw new IllegalArgumentException("Unknown IndexCreator type: " + type); + } + + return idxCreator; + } + + /** + * Read all of the VCF records from source into memory, returning the header and the VariantContexts + * + * SHOULD ONLY BE USED FOR UNIT/INTEGRATION TESTING PURPOSES! + * + * @param source the file to read, must be in VCF4 format + * @return + * @throws java.io.IOException + */ + public static Pair> readVCF(final File source) throws IOException { + // read in the features + final List vcs = new ArrayList(); + final VCFCodec codec = new VCFCodec(); + PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); + final LineIterator vcfSource = codec.makeSourceFromStream(pbs); + try { + final VCFHeader vcfHeader = (VCFHeader) codec.readActualHeader(vcfSource); + + while (vcfSource.hasNext()) { + final VariantContext vc = codec.decode(vcfSource); + if ( vc != null ) + vcs.add(vc); + } + + return new Pair>(vcfHeader, vcs); + } finally { + codec.close(vcfSource); + } + } + + /** + * Check if the two indices are equivalent + * + * @param thisIndex index + * @param otherIndex index + * @return true if indices are equivalent, false otherwise. + */ + public static boolean equivalentAbstractIndices(AbstractIndex thisIndex, AbstractIndex otherIndex){ + return thisIndex.getVersion() == otherIndex.getVersion() && + thisIndex.getIndexedFile().equals(otherIndex.getIndexedFile()) && + thisIndex.getIndexedFileSize() == otherIndex.getIndexedFileSize() && + thisIndex.getIndexedFileMD5().equals(otherIndex.getIndexedFileMD5()) && + thisIndex.getFlags() == otherIndex.getFlags(); + } + + /** + * Check if the two indices are equivalent for a chromosome + * + * @param thisIndex index + * @param otherIndex index + * @param chr chromosome + * @return true if indices are equivalent, false otherwise. + * @throws NoSuchFieldException if index does not exist for a chromosome + * @throws IllegalAccessException if index does not exist for a chromosome + */ + public static boolean equivalentLinearIndices(LinearIndex thisIndex, LinearIndex otherIndex, String chr) throws NoSuchFieldException, IllegalAccessException { + htsjdk.tribble.index.linear.LinearIndex.ChrIndex thisChr = (htsjdk.tribble.index.linear.LinearIndex.ChrIndex)getChrIndex(thisIndex, chr); + htsjdk.tribble.index.linear.LinearIndex.ChrIndex otherChr = (htsjdk.tribble.index.linear.LinearIndex.ChrIndex)getChrIndex(otherIndex, chr); + + return thisChr.getName().equals(otherChr.getName()) && + //thisChr.getTotalSize() == otherChr.getTotalSize() && TODO: why does this differ? + thisChr.getNFeatures() == otherChr.getNFeatures() && + thisChr.getNBlocks() == otherChr.getNBlocks(); + } + + /** + * Check if the two interval indices are equivalent for a chromosome + * + * @param thisIndex interval index + * @param otherIndex interval index + * @param chr chromosome + * @return true if indices are equivalent, false otherwise. + * @throws NoSuchFieldException if index does not exist for a chromosome + * @throws IllegalAccessException if index does not exist for a chromosome + */ + public static boolean equivalentIntervalIndices(IntervalTreeIndex thisIndex, IntervalTreeIndex otherIndex, String chr) throws NoSuchFieldException, IllegalAccessException { + htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex thisChr = (htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex)getChrIndex(thisIndex, chr); + htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex otherChr = (htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex)getChrIndex(otherIndex, chr); + + // TODO: compare trees? + return thisChr.getName().equals(otherChr.getName()); + } + + /** + * Get index for a chromosome + * + * @param index index + * @param chr chromosome + * @return index for the chromosome + * @throws NoSuchFieldException if index does not exist for a chromosome + * @throws IllegalAccessException if index does not exist for a chromosome + */ + public static ChrIndex getChrIndex(AbstractIndex index, String chr) throws NoSuchFieldException, IllegalAccessException { + Field f = AbstractIndex.class.getDeclaredField("chrIndices"); + f.setAccessible(true); + LinkedHashMap chrIndices = (LinkedHashMap) f.get(index); + return chrIndices.get(chr); + } + + /** + * Make an IndexCreator + * + * @param variantIndexType variant indexing strategy + * @param variantIndexParameter variant indexing parameter + * @param outputFile output variant file + * @param sequenceDictionary collection of SAM sequence records + * @return IndexCreator + */ + public static IndexCreator makeIndexCreator(final GATKVCFIndexType variantIndexType, final int variantIndexParameter, final File outputFile, final SAMSequenceDictionary sequenceDictionary) { + /* + * If using the index arguments, log a warning. + * If the genotype file has the GCVF extension (.g.vcf), use the default GCVF indexing. + * Otherwise, use the default index type and parameter. + */ + GATKVCFIndexType indexType = DEFAULT_INDEX_TYPE; + int indexParameter = DEFAULT_INDEX_PARAMETER; + if (usingNonDefaultIndexingArguments(variantIndexType, variantIndexParameter)) { + indexType = variantIndexType; + indexParameter = variantIndexParameter; + logger.warn(DEPRECATED_INDEX_ARGS_MSG); + } else if (outputFile.getName().endsWith("." + GVCF_EXT)) { + indexType = DEFAULT_GVCF_INDEX_TYPE; + indexParameter = DEFAULT_GVCF_INDEX_PARAMETER; + } + + return getIndexCreator(indexType, indexParameter, outputFile, sequenceDictionary); + } + + /** + * Check if not using the default indexing arguments' values + * + * @param variantIndexType variant indexing strategy + * @param variantIndexParameter variant indexing parameter + * @return true if the index type or parameter are not the default values, false otherwise + */ + public static boolean usingNonDefaultIndexingArguments(final GATKVCFIndexType variantIndexType, final int variantIndexParameter) { + return variantIndexType != GATKVCFUtils.DEFAULT_INDEX_TYPE || variantIndexParameter != GATKVCFUtils.DEFAULT_INDEX_PARAMETER; + } + + /** + * Check if using the GCVF indexing arguments' values + * + * @param variantIndexType variant indexing strategy + * @param variantIndexParameter variant indexing parameter + * @return true if the index type and parameter are the default GVCF values, false otherwise + */ + public static boolean usingGVCFIndexingArguments(final GATKVCFIndexType variantIndexType, final int variantIndexParameter) { + return variantIndexType == GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE && variantIndexParameter == GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER; + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java new file mode 100644 index 000000000..3117d3c57 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java @@ -0,0 +1,1325 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import com.google.java.contract.Ensures; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.variant.vcf.VCFConstants; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; +import org.broadinstitute.gatk.engine.filters.DisableableReadFilter; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.engine.datasources.reads.*; +import org.broadinstitute.gatk.engine.datasources.reference.ReferenceDataSource; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.engine.executive.MicroScheduler; +import org.broadinstitute.gatk.engine.filters.FilterManager; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.filters.ReadGroupBlackListFilter; +import org.broadinstitute.gatk.engine.io.OutputTracker; +import org.broadinstitute.gatk.engine.io.stubs.Stub; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.engine.iterators.ReadTransformersMode; +import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; +import org.broadinstitute.gatk.utils.io.ReferenceBacked; +import org.broadinstitute.gatk.utils.refdata.tracks.IndexDictionaryUtils; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrackBuilder; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.engine.samples.SampleDB; +import org.broadinstitute.gatk.engine.samples.SampleDBBuilder; +import org.broadinstitute.gatk.engine.walkers.*; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.*; +import org.broadinstitute.gatk.utils.classloader.PluginManager; +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.interval.IntervalUtils; +import org.broadinstitute.gatk.utils.progressmeter.ProgressMeter; +import org.broadinstitute.gatk.engine.recalibration.BQSRArgumentSet; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.utils.text.XReadLines; +import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; +import java.util.concurrent.TimeUnit; + +import static org.broadinstitute.gatk.utils.DeprecatedToolChecks.getWalkerDeprecationInfo; +import static org.broadinstitute.gatk.utils.DeprecatedToolChecks.isDeprecatedWalker; + +/** + * A GenomeAnalysisEngine that runs a specified walker. + */ +public class GenomeAnalysisEngine { + /** + * our log, which we want to capture anything from this class + */ + private static Logger logger = Logger.getLogger(GenomeAnalysisEngine.class); + public static final long NO_RUNTIME_LIMIT = -1; + + /** + * The GATK command-line argument parsing code. + */ + private ParsingEngine parsingEngine; + + /** + * The genomeLocParser can create and parse GenomeLocs. + */ + private GenomeLocParser genomeLocParser; + + /** + * Accessor for sharded read data. + */ + private SAMDataSource readsDataSource = null; + + /** + * Accessor for sharded reference data. + */ + private ReferenceDataSource referenceDataSource = null; + + /** + * Accessor for sample metadata + */ + private SampleDB sampleDB = new SampleDB(); + + /** + * Accessor for sharded reference-ordered data. + */ + private List rodDataSources; + + // our argument collection + private GATKArgumentCollection argCollection; + + /** + * Collection of intervals used by the engine. + */ + private GenomeLocSortedSet intervals = null; + + /** + * Explicitly assign the interval set to use for this traversal (for unit testing purposes) + * @param intervals set of intervals to use for this traversal + */ + public void setIntervals( GenomeLocSortedSet intervals ) { + this.intervals = intervals; + } + + /** + * Collection of inputs used by the engine. + */ + private Map inputs = new HashMap(); + + /** + * Collection of outputs used by the engine. + */ + private Collection> outputs = new ArrayList>(); + + /** + * Collection of the filters applied to the input data. + */ + private Collection filters; + + /** + * Collection of the read transformers applied to the reads + */ + private List readTransformers; + + /** + * Controls the allocation of threads between CPU vs IO. + */ + private ThreadAllocation threadAllocation; + + private ReadMetrics cumulativeMetrics = null; + + /** + * A currently hacky unique name for this GATK instance + */ + private String myName = "GATK_" + Math.abs(Utils.getRandomGenerator().nextInt()); + + /** + * our walker manager + */ + private final WalkerManager walkerManager = new WalkerManager(); + + private Walker walker; + + public void setWalker(Walker walker) { + this.walker = walker; + } + + /** + * The short name of the current GATK walker as a string + * @return a non-null String + */ + public String getWalkerName() { + return getWalkerName(walker.getClass()); + } + + /** + * A processed collection of SAM reader identifiers. + */ + private Collection samReaderIDs = Collections.emptyList(); + + /** + * Set the SAM/BAM files over which to traverse. + * @param samReaderIDs Collection of ids to use during this traversal. + */ + public void setSAMFileIDs(Collection samReaderIDs) { + this.samReaderIDs = samReaderIDs; + } + + /** + * Collection of reference metadata files over which to traverse. + */ + private Collection referenceMetaDataFiles; + + /** + * The threading efficiency monitor we use in the GATK to monitor our efficiency. + * + * May be null if one isn't active, or hasn't be initialized yet + */ + private ThreadEfficiencyMonitor threadEfficiencyMonitor = null; + + /** + * The global progress meter we are using to track our progress through the genome + */ + private ProgressMeter progressMeter = null; + + /** + * Set the reference metadata files to use for this traversal. + * @param referenceMetaDataFiles Collection of files and descriptors over which to traverse. + */ + public void setReferenceMetaDataFiles(Collection referenceMetaDataFiles) { + this.referenceMetaDataFiles = referenceMetaDataFiles; + } + + /** + * The maximum runtime of this engine, in nanoseconds, set during engine initialization + * from the GATKArgumentCollection command line value + */ + private long runtimeLimitInNanoseconds = -1; + + /** + * Base Quality Score Recalibration helper object + */ + private BQSRArgumentSet bqsrArgumentSet = null; + public BQSRArgumentSet getBQSRArgumentSet() { return bqsrArgumentSet; } + public boolean hasBQSRArgumentSet() { return bqsrArgumentSet != null; } + public void setBaseRecalibration(final GATKArgumentCollection args) { + bqsrArgumentSet = new BQSRArgumentSet(args); + } + + /** + * Actually run the GATK with the specified walker. + * + * @return the value of this traversal. + */ + public Object execute() { + // first thing is to make sure the AWS keys can be decrypted + GATKRunReport.checkAWSAreValid(); + + //HeapSizeMonitor monitor = new HeapSizeMonitor(); + //monitor.start(); + setStartTime(new java.util.Date()); + + final GATKArgumentCollection args = this.getArguments(); + + // validate our parameters + if (args == null) { + throw new ReviewedGATKException("The GATKArgumentCollection passed to GenomeAnalysisEngine cannot be null."); + } + + // validate our parameters + if (this.walker == null) + throw new ReviewedGATKException("The walker passed to GenomeAnalysisEngine cannot be null."); + + // check that active region walkers do not use the downsampling to coverage argument + checkDownSamplingToCoverage(); + + if (args.nonDeterministicRandomSeed) + Utils.resetRandomGenerator(System.currentTimeMillis()); + + // if the use specified an input BQSR recalibration table then enable on the fly recalibration + if (args.BQSR_RECAL_FILE != null) + setBaseRecalibration(args); + + // setup the runtime limits + setupRuntimeLimits(args); + + // Determine how the threads should be divided between CPU vs. IO. + determineThreadAllocation(); + + // Prepare the data for traversal. + initializeDataSources(); + + // initialize and validate the interval list + initializeIntervals(); + validateSuppliedIntervals(); + + // check to make sure that all sequence dictionaries are compatible with the reference's sequence dictionary + validateDataSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), rodDataSources); + + // initialize sampleDB + initializeSampleDB(); + + // our microscheduler, which is in charge of running everything + MicroScheduler microScheduler = createMicroscheduler(); + threadEfficiencyMonitor = microScheduler.getThreadEfficiencyMonitor(); + + // create temp directories as necessary + initializeTempDirectory(); + + // create the output streams + initializeOutputStreams(microScheduler.getOutputTracker()); + + // Initializing the shard iterator / BAM schedule might take some time, so let the user know vaguely what's going on + logger.info("Preparing for traversal" + + (readsDataSource.getReaderIDs().size() > 0 ? String.format(" over %d BAM files", readsDataSource.getReaderIDs().size()) : "")); + Iterable shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); + logger.info("Done preparing for traversal"); + + // execute the microscheduler, storing the results + return microScheduler.execute(this.walker, shardStrategy); + + //monitor.stop(); + //logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed())); + + //return result; + } + + /** + * Retrieves an instance of the walker based on the walker name. + * + * @param walkerName Name of the walker. Must not be null. If the walker cannot be instantiated, an exception will be thrown. + * @return An instance of the walker. + */ + public Walker getWalkerByName(String walkerName) { + try { + return walkerManager.createByName(walkerName); + } catch ( UserException e ) { + if ( isDeprecatedWalker(walkerName) ) { + e = new UserException.DeprecatedWalker(walkerName, getWalkerDeprecationInfo(walkerName)); + } + throw e; + } + } + + /** + * Gets the name of a given walker type. + * @param walkerType Type of walker. + * @return Name of the walker. + */ + public String getWalkerName(Class walkerType) { + return walkerManager.getName(walkerType); + } + + public String getName() { + return myName; + } + + /** + * Gets a list of the filters to associate with the given walker. Will NOT initialize the engine with this filters; + * the caller must handle that directly. + * @return A collection of available filters. + */ + public Collection createFilters() { + final List filters = new LinkedList<>(); + + // First add the user requested filters + if (this.getArguments().readGroupBlackList != null && !this.getArguments().readGroupBlackList.isEmpty()) + filters.add(new ReadGroupBlackListFilter(this.getArguments().readGroupBlackList)); + for(final String filterName: this.getArguments().readFilters) + filters.add(this.getFilterManager().createByName(filterName)); + + // now add the walker default filters. This ordering is critical important if + // users need to apply filters that fix up reads that would be removed by default walker filters + filters.addAll(WalkerManager.getReadFilters(walker,this.getFilterManager())); + + // disable user-specified read filters, if allowed + for(final String filterName: this.getArguments().disabledReadFilters) { + ReadFilter filterToDisable = this.getFilterManager().createByName(filterName); + if (! (filterToDisable instanceof DisableableReadFilter)) + throw new IllegalStateException(filterToDisable + " cannot be disabled"); + + // so we're not trying to modify the list we're iterating over + List filtersCopy = new ArrayList<>(filters); + for (ReadFilter filter : filtersCopy) { + if (filter.getClass() == filterToDisable.getClass()) + filters.remove(filter); + } + } + + return Collections.unmodifiableList(filters); + } + + /** + * Returns a list of active, initialized read transformers + * + * @param walker the walker we need to apply read transformers too + */ + public void initializeReadTransformers(final Walker walker) { + // keep a list of the active read transformers sorted based on priority ordering + List activeTransformers = new ArrayList(); + + final ReadTransformersMode overrideMode = WalkerManager.getWalkerAnnotation(walker, ReadTransformersMode.class); + final ReadTransformer.ApplicationTime overrideTime = overrideMode != null ? overrideMode.ApplicationTime() : null; + + final PluginManager pluginManager = new PluginManager(ReadTransformer.class); + + for ( final ReadTransformer transformer : pluginManager.createAllTypes() ) { + transformer.initialize(overrideTime, this, walker); + if ( transformer.enabled() ) + activeTransformers.add(transformer); + } + + setReadTransformers(activeTransformers); + } + + public List getReadTransformers() { + return readTransformers; + } + + /* + * Sanity checks that incompatible read transformers are not active together (and throws an exception if they are). + * + * @param readTransformers the active read transformers + */ + protected void checkActiveReadTransformers(final List readTransformers) { + if ( readTransformers == null ) + throw new IllegalArgumentException("read transformers cannot be null"); + + ReadTransformer sawMustBeFirst = null; + ReadTransformer sawMustBeLast = null; + + for ( final ReadTransformer r : readTransformers ) { + if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_FIRST ) { + if ( sawMustBeFirst != null ) + throw new UserException.IncompatibleReadFiltersException(sawMustBeFirst.toString(), r.toString()); + sawMustBeFirst = r; + } else if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_LAST ) { + if ( sawMustBeLast != null ) + throw new UserException.IncompatibleReadFiltersException(sawMustBeLast.toString(), r.toString()); + sawMustBeLast = r; + } + } + } + + protected void setReadTransformers(final List readTransformers) { + if ( readTransformers == null ) + throw new ReviewedGATKException("read transformers cannot be null"); + + // sort them in priority order + Collections.sort(readTransformers, new ReadTransformer.ReadTransformerComparator()); + + // make sure we don't have an invalid set of active read transformers + checkActiveReadTransformers(readTransformers); + + this.readTransformers = readTransformers; + } + + /** + * Parse out the thread allocation from the given command-line argument. + */ + private void determineThreadAllocation() { + if ( argCollection.numberOfDataThreads < 1 ) throw new UserException.BadArgumentValue("num_threads", "cannot be less than 1, but saw " + argCollection.numberOfDataThreads); + if ( argCollection.numberOfCPUThreadsPerDataThread < 1 ) throw new UserException.BadArgumentValue("num_cpu_threads", "cannot be less than 1, but saw " + argCollection.numberOfCPUThreadsPerDataThread); + if ( argCollection.numberOfIOThreads < 0 ) throw new UserException.BadArgumentValue("num_io_threads", "cannot be less than 0, but saw " + argCollection.numberOfIOThreads); + + this.threadAllocation = new ThreadAllocation(argCollection.numberOfDataThreads, + argCollection.numberOfCPUThreadsPerDataThread, + argCollection.numberOfIOThreads, + argCollection.monitorThreadEfficiency); + } + + public int getTotalNumberOfThreads() { + return this.threadAllocation == null ? 1 : threadAllocation.getTotalNumThreads(); + } + + /** + * Allow subclasses and others within this package direct access to the walker manager. + * @return The walker manager used by this package. + */ + protected WalkerManager getWalkerManager() { + return walkerManager; + } + + /** + * setup a microscheduler + * + * @return a new microscheduler + */ + private MicroScheduler createMicroscheduler() { + // Temporarily require all walkers to have a reference, even if that reference is not conceptually necessary. + if ((walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker) && + this.getArguments().referenceFile == null) { + throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given"); + } + + return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation); + } + + protected DownsamplingMethod getDownsamplingMethod() { + GATKArgumentCollection argCollection = this.getArguments(); + + DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod(); + DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker); + + DownsamplingMethod method = commandLineMethod != null ? commandLineMethod : walkerMethod; + checkCompatibilityWithWalker(method, walker); + return method; + } + + private static void checkCompatibilityWithWalker( DownsamplingMethod method, Walker walker ) { + // Refactored from DownsamplingMethod + final DownsampleType type = method.type; + final Integer toCoverage = method.toCoverage; + final boolean isLocusTraversal = walker instanceof LocusWalker || walker instanceof ActiveRegionWalker; + + if ( isLocusTraversal && type == DownsampleType.ALL_READS && toCoverage != null ) { + throw new UserException("Downsampling to coverage with the ALL_READS method for locus-based traversals (eg., LocusWalkers) is not currently supported (though it is supported for ReadWalkers)."); + } + + // For locus traversals, ensure that the dcov value (if present) is not problematically low + if ( isLocusTraversal && type != DownsampleType.NONE && toCoverage != null && + toCoverage < DownsamplingMethod.MINIMUM_SAFE_COVERAGE_TARGET_FOR_LOCUS_BASED_TRAVERSALS ) { + throw new UserException(String.format("Locus-based traversals (ie., Locus and ActiveRegion walkers) require " + + "a minimum -dcov value of %d when downsampling to coverage. Values less " + + "than this can produce problematic downsampling artifacts while providing " + + "only insignificant improvements in memory usage in most cases.", + DownsamplingMethod.MINIMUM_SAFE_COVERAGE_TARGET_FOR_LOCUS_BASED_TRAVERSALS)); + } + } + + protected void setDownsamplingMethod(DownsamplingMethod method) { + argCollection.setDownsamplingMethod(method); + } + + protected boolean includeReadsWithDeletionAtLoci() { + return walker.includeReadsWithDeletionAtLoci(); + } + + /** + * Verifies that the supplied set of reads files mesh with what the walker says it requires; + * also makes sure that list of SAM files specified on the command line is not empty and contains + * no duplicates. + */ + protected void validateSuppliedReads() { + GATKArgumentCollection arguments = this.getArguments(); + final Boolean samFilesArePresent = (arguments.samFiles != null && !arguments.samFiles.isEmpty()); + + // Check what the walker says is required against what was provided on the command line. + if (WalkerManager.isRequired(walker, DataSource.READS) && !samFilesArePresent) + throw new ArgumentException("Walker requires reads but none were provided."); + + // Check what the walker says is allowed against what was provided on the command line. + if (samFilesArePresent && !WalkerManager.isAllowed(walker, DataSource.READS)) + throw new ArgumentException("Walker does not allow reads but reads were provided."); + + //Make sure SAM list specified by the user (if necessary) is not empty + if(WalkerManager.isRequired(walker, DataSource.READS) && samFilesArePresent && samReaderIDs.isEmpty() ) { + throw new UserException("The list of input files does not contain any BAM files."); + } + + // Make sure no SAM files were specified multiple times by the user. + checkForDuplicateSamFiles(); + } + + /** + * Check that active region walkers do not use the downsampling to coverage argument + * + * @throws UserException if an active region walker is using the -dcov or --downsample_to_coverage downsampling arguments + */ + private void checkDownSamplingToCoverage() { + if (argCollection.downsampleCoverage != null && walker instanceof ActiveRegionWalker) { + throw new UserException.CommandLineException("Cannot use -dcov or --downsample_to_coverage for ActiveRegionWalkers, use another downsampling argument"); + } + } + + /** + * Checks whether there are SAM files that appear multiple times in the fully unpacked list of + * SAM files (samReaderIDs). If there are, throws an ArgumentException listing the files in question. + */ + protected void checkForDuplicateSamFiles() { + Set encounteredSamFiles = new HashSet(); + Set duplicateSamFiles = new LinkedHashSet(); + + for ( SAMReaderID samFile : samReaderIDs ) { + if ( encounteredSamFiles.contains(samFile) ) { + duplicateSamFiles.add(samFile.getSamFilePath()); + } + else { + encounteredSamFiles.add(samFile); + } + } + + if ( duplicateSamFiles.size() > 0 ) { + throw new UserException("The following BAM files appear multiple times in the list of input files: " + + duplicateSamFiles + " BAM files may be specified at most once."); + } + + } + + /** + * Verifies that the supplied reference file mesh with what the walker says it requires. + */ + protected void validateSuppliedReference() { + GATKArgumentCollection arguments = this.getArguments(); + // Check what the walker says is required against what was provided on the command line. + // TODO: Temporarily disabling WalkerManager.isRequired check on the reference because the reference is always required. + if (/*WalkerManager.isRequired(walker, DataSource.REFERENCE) &&*/ arguments.referenceFile == null) + throw new ArgumentException("Walker requires a reference but none was provided."); + + // Check what the walker says is allowed against what was provided on the command line. + if (arguments.referenceFile != null && !WalkerManager.isAllowed(walker, DataSource.REFERENCE)) + throw new ArgumentException("Walker does not allow a reference but one was provided."); + } + + protected void validateSuppliedIntervals() { + // Only read walkers support '-L unmapped' intervals. Trap and validate any other instances of -L unmapped. + if(!(walker instanceof ReadWalker)) { + GenomeLocSortedSet intervals = getIntervals(); + if(intervals != null && getIntervals().contains(GenomeLoc.UNMAPPED)) + throw new ArgumentException("Interval list specifies unmapped region. Only read walkers may include the unmapped region."); + } + + // If intervals is non-null and empty at this point, it means that the list of intervals to process + // was filtered down to an empty set (eg., the user specified something like -L chr1 -XL chr1). Since + // this was very likely unintentional, the user should be informed of this. Note that this is different + // from the case where intervals == null, which indicates that there were no interval arguments. + if ( intervals != null && intervals.isEmpty() ) { + logger.warn("The given combination of -L and -XL options results in an empty set. No intervals to process."); + } + + // TODO: add a check for ActiveRegion walkers to prevent users from passing an entire contig/chromosome + } + + /** + * Get the sharding strategy given a driving data source. + * + * @param readsDataSource readsDataSource + * @param drivingDataSource Data on which to shard. + * @param intervals intervals + * @return the sharding strategy + */ + protected Iterable getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { + ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); + DownsamplingMethod downsamplingMethod = readsDataSource != null ? readsDataSource.getReadsInfo().getDownsamplingMethod() : null; + ReferenceDataSource referenceDataSource = this.getReferenceDataSource(); + + // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition. + if(!readsDataSource.isEmpty()) { + if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM)) + throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported."); + if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) + throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available."); + + if(walker instanceof LocusWalker) { + if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); + } + else if(walker instanceof ActiveRegionWalker) { + if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(new ActiveRegionShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new ActiveRegionShardBalancer()); + } + else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { + // Apply special validation to read pair walkers. + if(walker instanceof ReadPairWalker) { + if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker."); + if(intervals != null && !intervals.isEmpty()) + throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); + } + + if(intervals == null) + return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(intervals, new ReadShardBalancer()); + } + else + throw new ReviewedGATKException("Unable to determine walker type for walker " + walker.getClass().getName()); + } + else { + // TODO -- Determine what the ideal shard size should be here. Matt suggested that a multiple of 16K might work well + // TODO -- (because of how VCF indexes work), but my empirical experience has been simply that the larger the shard + // TODO -- size the more efficient the traversal (at least for RODWalkers). Keeping the previous values for now. [EB] + final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000; + if(intervals == null) + return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE); + else + return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE); + } + } + + protected boolean flashbackData() { + return walker instanceof ReadWalker; + } + + /** + * Create the temp directory if it doesn't exist. + */ + private void initializeTempDirectory() { + File tempDir = new File(System.getProperty("java.io.tmpdir")); + if (!tempDir.exists() && !tempDir.mkdirs()) + throw new UserException.BadTmpDir("Unable to create directory"); + } + + /** + * Initialize the output streams as specified by the user. + * + * @param outputTracker the tracker supplying the initialization data. + */ + private void initializeOutputStreams(final OutputTracker outputTracker) { + for (final Map.Entry input : getInputs().entrySet()) { + setReferenceFile(input.getValue()); + outputTracker.addInput(input.getKey(), input.getValue()); + } + for (final Stub stub : getOutputs()) { + setReferenceFile(stub); + stub.processArguments(argCollection); + outputTracker.addOutput(stub); + } + + outputTracker.prepareWalker(walker, getArguments().strictnessLevel); + } + + private void setReferenceFile(final Object object) { + if (object instanceof ReferenceBacked) { + ((ReferenceBacked)object).setReferenceFile(argCollection.referenceFile); + } + } + + public ReferenceDataSource getReferenceDataSource() { + return referenceDataSource; + } + + public GenomeLocParser getGenomeLocParser() { + return genomeLocParser; + } + + /** + * Manage lists of filters. + */ + private final FilterManager filterManager = new FilterManager(); + + private Date startTime = null; // the start time for execution + + public void setParser(ParsingEngine parsingEngine) { + this.parsingEngine = parsingEngine; + } + + /** + * Explicitly set the GenomeLocParser, for unit testing. + * @param genomeLocParser GenomeLocParser to use. + */ + public void setGenomeLocParser(GenomeLocParser genomeLocParser) { + this.genomeLocParser = genomeLocParser; + } + + /** + * Sets the start time when the execute() function was last called + * @param startTime the start time when the execute() function was last called + */ + protected void setStartTime(Date startTime) { + this.startTime = startTime; + } + + /** + * @return the start time when the execute() function was last called + */ + public Date getStartTime() { + return startTime; + } + + /** + * Setup the intervals to be processed + */ + protected void initializeIntervals() { + intervals = IntervalUtils.parseIntervalArguments(this.referenceDataSource.getReference(), argCollection.intervalArguments); + } + + /** + * Add additional, externally managed IO streams for inputs. + * + * @param argumentSource Field into which to inject the value. + * @param value Instance to inject. + */ + public void addInput(ArgumentSource argumentSource, Object value) { + inputs.put(argumentSource, value); + } + + /** + * Add additional, externally managed IO streams for output. + * + * @param stub Instance to inject. + */ + public void addOutput(Stub stub) { + outputs.add(stub); + } + + /** + * Returns the tag associated with a given command-line argument. + * @param key Object for which to inspect the tag. + * @return Tags object associated with the given key, or an empty Tag structure if none are present. + */ + public Tags getTags(Object key) { + return parsingEngine.getTags(key); + } + + protected void initializeDataSources() { + logger.info("Strictness is " + argCollection.strictnessLevel); + + validateSuppliedReference(); + setReferenceDataSource(argCollection.referenceFile); + + validateSuppliedReads(); + initializeReadTransformers(walker); + + final Map sampleRenameMap = argCollection.sampleRenameMappingFile != null ? + loadSampleRenameMap(argCollection.sampleRenameMappingFile) : + null; + + readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference(), sampleRenameMap); + + for (ReadFilter filter : filters) + filter.initialize(this); + + // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference + rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles,referenceDataSource.getReference().getSequenceDictionary(), + genomeLocParser,argCollection.unsafe,sampleRenameMap); + } + + /** + * Purely for testing purposes. Do not use unless you absolutely positively know what you are doing (or + * need to absolutely positively kill everyone in the room) + * @param dataSource + */ + public void setReadsDataSource(final SAMDataSource dataSource) { + this.readsDataSource = dataSource; + } + + /** + * Entry-point function to initialize the samples database from input data and pedigree arguments + */ + private void initializeSampleDB() { + SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType); + sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader()); + sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this)); + sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles); + sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings); + sampleDB = sampleDBBuilder.getFinalSampleDB(); + } + + /** + * Gets a unique identifier for the reader sourcing this read. + * @param read Read to examine. + * @return A unique identifier for the source file of this read. Exception if not found. + */ + public SAMReaderID getReaderIDForRead(final SAMRecord read) { + return getReadsDataSource().getReaderID(read); + } + + /** + * Gets the source file for this read. + * @param id Unique identifier determining which input file to use. + * @return The source filename for this read. + */ + public File getSourceFileForReaderID(final SAMReaderID id) { + return getReadsDataSource().getSAMFile(id); + } + + /** + * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available). + * + * @param reads Reads data source. + * @param reference Reference data source. + * @param rods a collection of the reference ordered data tracks + */ + private void validateDataSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection rods) { + if ((reads.isEmpty() && (rods == null || rods.isEmpty())) || reference == null ) + return; + + // Compile a set of sequence names that exist in the reference file. + SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary(); + + if (!reads.isEmpty()) { + // Compile a set of sequence names that exist in the BAM files. + SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); + + if (readsDictionary.size() == 0) { + logger.info("Reads file is unmapped. Skipping validation against reference."); + return; + } + + // compare the reads to the reference + SequenceDictionaryUtils.validateDictionaries(logger, getArguments().unsafe, "reads", readsDictionary, + "reference", referenceDictionary, true, intervals); + } + + for (ReferenceOrderedDataSource rod : rods) + IndexDictionaryUtils.validateTrackSequenceDictionary(rod.getName(), rod.getSequenceDictionary(), referenceDictionary, getArguments().unsafe); + } + + /** + * Gets a data source for the given set of reads. + * + * @param argCollection arguments + * @param genomeLocParser parser + * @param refReader reader + * @return A data source for the given set of reads. + */ + private SAMDataSource createReadsDataSource(final GATKArgumentCollection argCollection, final GenomeLocParser genomeLocParser, + final IndexedFastaSequenceFile refReader, final Map sampleRenameMap) { + DownsamplingMethod downsamplingMethod = getDownsamplingMethod(); + + // Synchronize the method back into the collection so that it shows up when + // interrogating for the downsampling method during command line recreation. + setDownsamplingMethod(downsamplingMethod); + + logger.info(downsamplingMethod); + + if (argCollection.removeProgramRecords && argCollection.keepProgramRecords) + throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options"); + + boolean removeProgramRecords = argCollection.removeProgramRecords || walker.getClass().isAnnotationPresent(RemoveProgramRecords.class); + + if (argCollection.keepProgramRecords) + removeProgramRecords = false; + + final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker; + + return new SAMDataSource( + argCollection.referenceFile, + samReaderIDs, + threadAllocation, + argCollection.numberOfBAMFileHandles, + genomeLocParser, + argCollection.useOriginalBaseQualities, + argCollection.strictnessLevel, + argCollection.readBufferSize, + downsamplingMethod, + new ValidationExclusion(Arrays.asList(argCollection.unsafe)), + filters, + readTransformers, + includeReadsWithDeletionAtLoci(), + argCollection.defaultBaseQualities, + removeProgramRecords, + keepReadsInLIBS, + sampleRenameMap, + argCollection.intervalArguments.intervalMerging); + } + + /** + * Loads a user-provided sample rename map file for use in on-the-fly sample renaming into an in-memory + * HashMap. This file must consist of lines with two whitespace-separated fields, the second of which + * may contain whitespace: + * + * absolute_path_to_file new_sample_name + * + * The engine will verify that each file contains data from only one sample when the on-the-fly sample + * renaming feature is being used. Note that this feature works only with bam and vcf files. + * + * @param sampleRenameMapFile sample rename map file from which to load data + * @return a HashMap containing the contents of the map file, with the keys being the input file paths and + * the values being the new sample names. + */ + protected Map loadSampleRenameMap( final File sampleRenameMapFile ) { + logger.info("Renaming samples from input files on-the-fly using mapping file " + sampleRenameMapFile.getAbsolutePath()); + + final Map sampleRenameMap = new HashMap<>((int)sampleRenameMapFile.length() / 50); + + try { + for ( final String line : new XReadLines(sampleRenameMapFile) ) { + final String[] tokens = line.split("\\s+", 2); + + if ( tokens.length != 2 ) { + throw new UserException.MalformedFile(sampleRenameMapFile, + String.format("Encountered a line with %s fields instead of the required 2 fields. Line was: %s", + tokens.length, line)); + } + + final File inputFile = new File(tokens[0]); + final String newSampleName = tokens[1].trim(); + + if (newSampleName.contains(VCFConstants.FIELD_SEPARATOR)) { + throw new UserException.MalformedFile(sampleRenameMapFile, String.format( + "Encountered illegal sample name; sample names may not include the VCF field delimiter (%s). Sample name: %s; line: %s", + VCFConstants.FIELD_SEPARATOR, + newSampleName, + line + )); + } + + if ( ! inputFile.isAbsolute() ) { + throw new UserException.MalformedFile(sampleRenameMapFile, "Input file path not absolute at line: " + line); + } + + final String inputFilePath = inputFile.getAbsolutePath(); + + if ( sampleRenameMap.containsKey(inputFilePath) ) { + throw new UserException.MalformedFile(sampleRenameMapFile, + String.format("Input file %s appears more than once", inputFilePath)); + } + + sampleRenameMap.put(inputFilePath, newSampleName); + } + } + catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(sampleRenameMapFile, e); + } + + return sampleRenameMap; + } + + + /** + * Opens a reference sequence file paired with an index. Only public for testing purposes + * + * @param refFile Handle to a reference sequence file. Non-null. + */ + public void setReferenceDataSource(File refFile) { + this.referenceDataSource = new ReferenceDataSource(refFile); + genomeLocParser = new GenomeLocParser(referenceDataSource.getReference()); + } + + /** + * Open the reference-ordered data sources. + * + * @param referenceMetaDataFiles collection of RMD descriptors to load and validate. + * @param sequenceDictionary GATK-wide sequnce dictionary to use for validation. + * @param genomeLocParser to use when creating and validating GenomeLocs. + * @param validationExclusionType potentially indicate which validations to include / exclude. + * @param sampleRenameMap map of file -> new sample name used when doing on-the-fly sample renaming + * + * @return A list of reference-ordered data sources. + */ + private List getReferenceOrderedDataSources(final Collection referenceMetaDataFiles, + final SAMSequenceDictionary sequenceDictionary, + final GenomeLocParser genomeLocParser, + final ValidationExclusion.TYPE validationExclusionType, + final Map sampleRenameMap) { + final RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, validationExclusionType, + getArguments().disableAutoIndexCreationAndLockingWhenReadingRods, + sampleRenameMap); + + final List dataSources = new ArrayList(); + for (RMDTriplet fileDescriptor : referenceMetaDataFiles) + dataSources.add(new ReferenceOrderedDataSource(fileDescriptor, + builder, + sequenceDictionary, + genomeLocParser, + flashbackData())); + + return dataSources; + } + + /** + * Returns the SAM File Header from the input reads' data source file + * @return the SAM File Header from the input reads' data source file + */ + public SAMFileHeader getSAMFileHeader() { + return readsDataSource.getHeader(); + } + + public boolean lenientVCFProcessing() { + return ValidationExclusion.lenientVCFProcessing(argCollection.unsafe); + } + + /** + * Returns the unmerged SAM file header for an individual reader. + * @param reader The reader. + * @return Header for that reader or null if not available. + */ + public SAMFileHeader getSAMFileHeader(SAMReaderID reader) { + return readsDataSource == null ? null : readsDataSource.getHeader(reader); + } + + /** + * Returns an ordered list of the unmerged SAM file headers known to this engine. + * @return list of header for each input SAM file, in command line order + */ + public List getSAMFileHeaders() { + final List headers = new ArrayList(); + for ( final SAMReaderID id : getReadsDataSource().getReaderIDs() ) { + headers.add(getReadsDataSource().getHeader(id)); + } + return headers; + } + + /** + * Gets the master sequence dictionary for this GATK engine instance + * @return a never-null dictionary listing all of the contigs known to this engine instance + */ + public SAMSequenceDictionary getMasterSequenceDictionary() { + return getReferenceDataSource().getReference().getSequenceDictionary(); + } + + /** + * Returns data source object encapsulating all essential info and handlers used to traverse + * reads; header merger, individual file readers etc can be accessed through the returned data source object. + * + * @return the reads data source + */ + public SAMDataSource getReadsDataSource() { + return this.readsDataSource; + } + + /** + * Sets the collection of GATK main application arguments. + * + * @param argCollection the GATK argument collection + */ + public void setArguments(GATKArgumentCollection argCollection) { + this.argCollection = argCollection; + } + + /** + * Gets the collection of GATK main application arguments. + * + * @return the GATK argument collection + */ + public GATKArgumentCollection getArguments() { + return this.argCollection; + } + + /** + * Get the list of intervals passed to the engine. + * @return List of intervals, or null if no intervals are in use + */ + public GenomeLocSortedSet getIntervals() { + return this.intervals; + } + + /** + * Get the list of regions of the genome being processed. If the user + * requested specific intervals, return those, otherwise return regions + * corresponding to the entire genome. Never returns null. + * + * @return a non-null set of intervals being processed + */ + @Ensures("result != null") + public GenomeLocSortedSet getRegionsOfGenomeBeingProcessed() { + if ( getIntervals() == null ) + // if we don't have any intervals defined, create intervals from the reference itself + return GenomeLocSortedSet.createSetFromSequenceDictionary(getReferenceDataSource().getReference().getSequenceDictionary()); + else + return getIntervals(); + } + + /** + * Gets the list of filters employed by this engine. + * @return Collection of filters (actual instances) used by this engine. + */ + public Collection getFilters() { + return this.filters; + } + + /** + * Sets the list of filters employed by this engine. + * @param filters Collection of filters (actual instances) used by this engine. + */ + public void setFilters(Collection filters) { + this.filters = filters; + } + + /** + * Gets the filter manager for this engine. + * @return filter manager for this engine. + */ + protected FilterManager getFilterManager() { + return filterManager; + } + + /** + * Gets the input sources for this engine. + * @return input sources for this engine. + */ + protected Map getInputs() { + return inputs; + } + + /** + * Gets the output stubs for this engine. + * @return output stubs for this engine. + */ + protected Collection> getOutputs() { + return outputs; + } + + /** + * Returns data source objects encapsulating all rod data; + * individual rods can be accessed through the returned data source objects. + * + * @return the rods data sources, never {@code null}. + */ + public List getRodDataSources() { + return this.rodDataSources; + } + + /** + * Gets cumulative metrics about the entire run to this point. + * Returns a clone of this snapshot in time. + * @return cumulative metrics about the entire run at this point. ReadMetrics object is a unique instance and is + * owned by the caller; the caller can do with the object what they wish. + */ + public ReadMetrics getCumulativeMetrics() { + // todo -- probably shouldn't be lazy + if ( cumulativeMetrics == null ) + cumulativeMetrics = readsDataSource == null ? new ReadMetrics() : readsDataSource.getCumulativeReadMetrics(); + return cumulativeMetrics; + } + + /** + * Return the global ThreadEfficiencyMonitor, if there is one + * + * @return the monitor, or null if none is active + */ + public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { + return threadEfficiencyMonitor; + } + + // ------------------------------------------------------------------------------------- + // + // code for working with Samples database + // + // ------------------------------------------------------------------------------------- + + public SampleDB getSampleDB() { + return this.sampleDB; + } + + public Map getApproximateCommandLineArguments(Object... argumentProviders) { + return CommandLineUtils.getApproximateCommandLineArguments(parsingEngine,argumentProviders); + } + + public String createApproximateCommandLineArgumentString(Object... argumentProviders) { + return CommandLineUtils.createApproximateCommandLineArgumentString(parsingEngine,argumentProviders); + } + + // ------------------------------------------------------------------------------------- + // + // code for working with progress meter + // + // ------------------------------------------------------------------------------------- + + /** + * Register the global progress meter with this engine + * + * Calling this function more than once will result in an IllegalStateException + * + * @param meter a non-null progress meter + */ + public void registerProgressMeter(final ProgressMeter meter) { + if ( meter == null ) throw new IllegalArgumentException("Meter cannot be null"); + if ( progressMeter != null ) throw new IllegalStateException("Progress meter already set"); + + progressMeter = meter; + } + + /** + * Get the progress meter being used by this engine. May be null if no meter has been registered yet + * @return a potentially null pointer to the progress meter + */ + public ProgressMeter getProgressMeter() { + return progressMeter; + } + + /** + * Does the current runtime in unit exceed the runtime limit, if one has been provided? + * + * @return false if not limit was requested or if runtime <= the limit, true otherwise + */ + public boolean exceedsRuntimeLimit() { + if ( progressMeter == null ) + // not yet initialized or not set because of testing + return false; + + if ( getArguments().maxRuntime == NO_RUNTIME_LIMIT ) + return false; + else { + final long runtime = progressMeter.getRuntimeInNanosecondsUpdatedPeriodically(); + if ( runtime < 0 ) throw new IllegalArgumentException("runtime must be >= 0 but got " + runtime); + final long maxRuntimeNano = getRuntimeLimitInNanoseconds(); + return runtime > maxRuntimeNano; + } + } + + /** + * @return the runtime limit in nanoseconds, or -1 if no limit was specified + */ + public long getRuntimeLimitInNanoseconds() { + return runtimeLimitInNanoseconds; + } + + /** + * Setup the runtime limits for this engine, updating the runtimeLimitInNanoseconds + * as appropriate + * + * @param args the GATKArgumentCollection to retrieve our runtime limits from + */ + private void setupRuntimeLimits(final GATKArgumentCollection args) { + if ( args.maxRuntime == NO_RUNTIME_LIMIT ) + runtimeLimitInNanoseconds = -1; + else if (args.maxRuntime < 0 ) + throw new UserException.BadArgumentValue("maxRuntime", "must be >= 0 or == -1 (meaning no limit) but received negative value " + args.maxRuntime); + else { + runtimeLimitInNanoseconds = TimeUnit.NANOSECONDS.convert(args.maxRuntime, args.maxRuntimeUnits); + } + } + + /** + * Returns the sample list including all samples. + * @return never {@code null}. + */ + public SampleList getSampleList() { + return new IndexedSampleList(getSampleDB().getSampleNames()); + } + + /** + * Returns the sample list including samples in read inputs. + * @return never {@code null}. + */ + public SampleList getReadSampleList() { + return new IndexedSampleList(ReadUtils.getSAMFileSamples(getSAMFileHeader())); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/ReadProperties.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/ReadProperties.java new file mode 100644 index 000000000..0f6aee60c --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/ReadProperties.java @@ -0,0 +1,197 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.ValidationStringency; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; + +import java.util.Collection; +import java.util.List; +/** + * User: hanna + * Date: May 14, 2009 + * Time: 4:06:26 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * A data structure containing information about the reads data sources as well as + * information about how they should be downsampled, sorted, and filtered. + */ +public class ReadProperties { + private final Collection readers; + private final SAMFileHeader header; + private final SAMFileHeader.SortOrder sortOrder; + private final ValidationStringency validationStringency; + private final DownsamplingMethod downsamplingMethod; + private final ValidationExclusion exclusionList; + private final Collection supplementalFilters; + private final List readTransformers; + private final boolean keepUniqueReadListInLIBS; + private final boolean includeReadsWithDeletionAtLoci; + private final boolean useOriginalBaseQualities; + private final byte defaultBaseQualities; + + /** + * Return true if the walker wants to see reads that contain deletions when looking at locus pileups + * + * @return + */ + public boolean includeReadsWithDeletionAtLoci() { + return includeReadsWithDeletionAtLoci; + } + + public boolean keepUniqueReadListInLIBS() { + return keepUniqueReadListInLIBS; + } + + /** + * Gets a list of the files acting as sources of reads. + * @return A list of files storing reads data. + */ + public Collection getSAMReaderIDs() { + return readers; + } + + /** + * Gets the sam file header + * @return the sam file header + */ + public SAMFileHeader getHeader() { + return header; + } + + /** + * Gets the sort order of the reads + * @return the sort order of the reads + */ + public SAMFileHeader.SortOrder getSortOrder() { + return sortOrder; + } + + /** + * How strict should validation be? + * @return Stringency of validation. + */ + public ValidationStringency getValidationStringency() { + return validationStringency; + } + + /** + * Gets the method and parameters used when downsampling reads. + * @return Downsample fraction. + */ + public DownsamplingMethod getDownsamplingMethod() { + return downsamplingMethod; + } + + /** + * Return whether to 'verify' the reads as we pass through them. + * @return Whether to verify the reads. + */ + public ValidationExclusion getValidationExclusionList() { + return exclusionList; + } + + public Collection getSupplementalFilters() { + return supplementalFilters; + } + + + public List getReadTransformers() { + return readTransformers; + } + + /** + * Return whether to use original base qualities. + * @return Whether to use original base qualities. + */ + public boolean useOriginalBaseQualities() { + return useOriginalBaseQualities; + } + + /** + * @return Default base quality value to fill reads missing base quality information. + */ + public byte defaultBaseQualities() { + return defaultBaseQualities; + } + + /** + * Extract the command-line arguments having to do with reads input + * files and store them in an easy-to-work-with package. Constructor + * is package protected. + * @param samFiles list of reads files. + * @param header sam file header. + * @param useOriginalBaseQualities True if original base qualities should be used. + * @param strictness Stringency of reads file parsing. + * @param downsamplingMethod Method for downsampling reads at a given locus. + * @param exclusionList what safety checks we're willing to let slide + * @param supplementalFilters additional filters to dynamically apply. + * @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method + * will explicitly list reads with deletion over the current reference base; otherwise, only observed + * bases will be seen in the pileups, and the deletions will be skipped silently. + * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. + * @param keepUniqueReadListInLIBS If true, we will tell LocusIteratorByState to track the unique reads it sees + * This is really useful for ActiveRegionTraversals + */ + public ReadProperties( Collection samFiles, + SAMFileHeader header, + SAMFileHeader.SortOrder sortOrder, + boolean useOriginalBaseQualities, + ValidationStringency strictness, + DownsamplingMethod downsamplingMethod, + ValidationExclusion exclusionList, + Collection supplementalFilters, + List readTransformers, + boolean includeReadsWithDeletionAtLoci, + byte defaultBaseQualities, + final boolean keepUniqueReadListInLIBS) { + this.readers = samFiles; + this.header = header; + this.sortOrder = sortOrder; + this.validationStringency = strictness; + this.downsamplingMethod = downsamplingMethod == null ? DownsamplingMethod.NONE : downsamplingMethod; + this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList; + this.supplementalFilters = supplementalFilters; + this.readTransformers = readTransformers; + this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci; + this.useOriginalBaseQualities = useOriginalBaseQualities; + this.defaultBaseQualities = defaultBaseQualities; + this.keepUniqueReadListInLIBS = keepUniqueReadListInLIBS; + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/SampleUtils.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/SampleUtils.java new file mode 100644 index 000000000..eb98e0bb4 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/SampleUtils.java @@ -0,0 +1,258 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.gatk.utils.collections.Pair; +import htsjdk.variant.vcf.VCFHeader; +import org.broadinstitute.gatk.utils.text.ListFileUtils; +import org.broadinstitute.gatk.utils.text.XReadLines; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + + +/** + * SampleUtils is a static class (no instantiation allowed!) with some utility methods for getting samples + * quality scores. + * + * @author ebanks + */ +public class SampleUtils { + /** + * Private constructor. No instantiating this class! + */ + private SampleUtils() {} + + /** + * Gets all of the unique sample names from all VCF rods input by the user + * + * @param toolkit GATK engine + * + * @return the set of unique samples + */ + public static Set getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit) { + return getUniqueSamplesFromRods(toolkit, null); + } + + /** + * Gets all of the unique sample names from the set of provided VCF rod names input by the user + * + * @param toolkit GATK engine + * @param rodNames list of rods to use; if null, uses all VCF rods + * + * @return the set of unique samples + */ + public static Set getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit, Collection rodNames) { + Set samples = new LinkedHashSet<>(); + + for ( VCFHeader header : GATKVCFUtils.getVCFHeadersFromRods(toolkit, rodNames).values() ) + samples.addAll(header.getGenotypeSamples()); + + return samples; + } + + public static Set getRodNamesWithVCFHeader(GenomeAnalysisEngine toolkit, Collection rodNames) { + return GATKVCFUtils.getVCFHeadersFromRods(toolkit, rodNames).keySet(); + } + + public static Set getSampleListWithVCFHeader(GenomeAnalysisEngine toolkit, Collection rodNames) { + return getSampleList(GATKVCFUtils.getVCFHeadersFromRods(toolkit, rodNames)); + } + + public static Set getSampleList(Map headers) { + return getSampleList(headers, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE); + } + + public static Set getSampleList(Map headers, GATKVariantContextUtils.GenotypeMergeType mergeOption) { + Set samples = new TreeSet(); + for ( Map.Entry val : headers.entrySet() ) { + VCFHeader header = val.getValue(); + for ( String sample : header.getGenotypeSamples() ) { + samples.add(GATKVariantContextUtils.mergedSampleName(val.getKey(), sample, mergeOption == GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY)); + } + } + + return samples; + } + + + /** + * + * @param VCF_Headers + * @return false if there are names duplication between the samples names in the VCF headers + */ + public static boolean verifyUniqueSamplesNames(Map VCF_Headers) { + Set samples = new HashSet(); + for ( Map.Entry val : VCF_Headers.entrySet() ) { + VCFHeader header = val.getValue(); + for ( String sample : header.getGenotypeSamples() ) { + if (samples.contains(sample)){ + + return false; + } + samples.add(sample); + } + } + + return true; + } + + /** + * Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap + * (e.g. sampleX.1, sampleX.2, ...) + * When finished, samples contains the uniquified sample names and rodNamesToSampleNames contains a mapping + * from rod/sample pairs to the new uniquified names + * + * @param toolkit GATK engine + * @param samples set to store the sample names + * @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names + */ + public static void getUniquifiedSamplesFromRods(GenomeAnalysisEngine toolkit, Set samples, Map, String> rodNamesToSampleNames) { + + // keep a map of sample name to occurrences encountered + HashMap sampleOverlapMap = new HashMap(); + + // iterate to get all of the sample names + + for ( Map.Entry pair : GATKVCFUtils.getVCFHeadersFromRods(toolkit).entrySet() ) { + for ( String sample : pair.getValue().getGenotypeSamples() ) + addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, pair.getKey()); + } + } + + private static void addUniqueSample(Set samples, Map sampleOverlapMap, Map, String> rodNamesToSampleNames, String newSample, String rodName) { + + // how many occurrences have we seen so far? + Integer occurrences = sampleOverlapMap.get(newSample); + + // if this is the first one, just add it to the list of samples + if ( occurrences == null ) { + samples.add(newSample); + rodNamesToSampleNames.put(new Pair(rodName, newSample), newSample); + sampleOverlapMap.put(newSample, 1); + } + + // if it's already been seen multiple times, give it a unique suffix and increment the value + else if ( occurrences >= 2 ) { + String uniqueName = newSample + "." + rodName; + samples.add(uniqueName); + rodNamesToSampleNames.put(new Pair(rodName, newSample), uniqueName); + sampleOverlapMap.put(newSample, occurrences + 1); + } + + // if this is the second occurrence of the sample name, uniquify both of them + else { // occurrences == 2 + + // remove the 1st occurrence, uniquify it, and add it back + samples.remove(newSample); + String uniqueName1 = null; + for ( Map.Entry, String> entry : rodNamesToSampleNames.entrySet() ) { + if ( entry.getValue().equals(newSample) ) { + uniqueName1 = newSample + "." + entry.getKey().first; + entry.setValue(uniqueName1); + break; + } + } + samples.add(uniqueName1); + + // add the second one + String uniqueName2 = newSample + "." + rodName; + samples.add(uniqueName2); + rodNamesToSampleNames.put(new Pair(rodName, newSample), uniqueName2); + + sampleOverlapMap.put(newSample, 2); + } + + } + + /** + * Returns a new set of samples, containing a final list of samples expanded from sampleArgs + * + * Each element E of sampleArgs can either be a literal sample name or a file. For each E, + * we try to read a file named E from disk, and if possible all lines from that file are expanded + * into unique sample names. + * + * @param sampleArgs args + * @return samples + */ + public static Set getSamplesFromCommandLineInput(Collection sampleArgs) { + if (sampleArgs != null) { + return ListFileUtils.unpackSet(sampleArgs); + } + + return new HashSet(); + } + + public static Set getSamplesFromCommandLineInput(Collection vcfSamples, Collection sampleExpressions) { + Set samples = ListFileUtils.unpackSet(vcfSamples); + if (sampleExpressions == null) { + return samples; + } else { + return ListFileUtils.includeMatching(samples, sampleExpressions, false); + } + } + + /** + * Given a collection of samples and a collection of regular expressions, generates the set of samples that match each expression + * @param originalSamples list of samples to select samples from + * @param sampleExpressions list of expressions to use for matching samples + * @return the set of samples from originalSamples that satisfy at least one of the expressions in sampleExpressions + */ + public static Collection matchSamplesExpressions (Collection originalSamples, Collection sampleExpressions) { + // Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions + Set samples = new HashSet(); + if (sampleExpressions != null) { + samples.addAll(ListFileUtils.includeMatching(originalSamples, sampleExpressions, false)); + } + return samples; + } + + /** + * Given a list of files with sample names it reads all files and creates a list of unique samples from all these files. + * @param files list of files with sample names in + * @return a collection of unique samples from all files + */ + public static Collection getSamplesFromFiles (Collection files) { + Set samplesFromFiles = new HashSet(); + if (files != null) { + for (File file : files) { + try { + XReadLines reader = new XReadLines(file); + List lines = reader.readLines(); + for (String line : lines) { + samplesFromFiles.add(line); + } + } catch (FileNotFoundException e) { + throw new UserException.CouldNotReadInputFile(file, e); + } + } + } + return samplesFromFiles; + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/WalkerManager.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/WalkerManager.java new file mode 100644 index 000000000..0660cb015 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/WalkerManager.java @@ -0,0 +1,451 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import org.broadinstitute.gatk.engine.filters.DisableableReadFilter; +import org.broadinstitute.gatk.engine.walkers.*; +import org.broadinstitute.gatk.utils.commandline.Hidden; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.engine.filters.FilterManager; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.utils.classloader.PluginManager; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.help.ResourceBundleExtractorDoclet; +import org.broadinstitute.gatk.utils.text.TextFormattingUtils; + +import java.lang.annotation.Annotation; +import java.util.*; + +/** + * Plugin manager that also provides various utilities for inspecting Walkers. + */ +public class WalkerManager extends PluginManager { + + /** + * A collection of help text for walkers and their enclosing packages. + */ + private ResourceBundle helpText; + + public WalkerManager() { + super(Walker.class,"walker",""); + helpText = TextFormattingUtils.GATK_RESOURCE_BUNDLE; + } + + /** + * Get the list of walkers currently available to the GATK, organized + * by package. + * @param visibleWalkersOnly If true, return only the walker names that aren't hidden. + * @return Names of currently available walkers. + */ + public Map>> getWalkerNamesByPackage(boolean visibleWalkersOnly) { + Map>> walkersByPackage = new HashMap>>(); + for(Class walker: getPlugins()) { + if(visibleWalkersOnly && isHidden(walker)) + continue; + + // Extract the name for the package; if the walker is in the unnamed package, use the empty string + String walkerPackage = walker.getPackage() != null ? walker.getPackage().getName() : ""; + if(!walkersByPackage.containsKey(walkerPackage)) + walkersByPackage.put(walkerPackage,new ArrayList>()); + walkersByPackage.get(walkerPackage).add(walker); + } + return Collections.unmodifiableMap(walkersByPackage); + } + + /** + * Gets the display name for a given package. + * @param packageName Fully qualified package name. + * @return A suitable display name for the package. + */ + public String getPackageDisplayName(String packageName) { + // ...try to compute the override from the text of the package name, while accounting for + // unpackaged walkers. + String displayName = packageName.substring(packageName.lastIndexOf('.')+1); + if (displayName.trim().equals("")) displayName = ""; + return displayName; + } + + /** + * Gets the help text associated with a given package name. + * @param packageName Package for which to search for help text. + * @return Package help text, or "" if none exists. + */ + public String getPackageSummaryText(String packageName) { + String key = String.format("%s.%s",packageName, ResourceBundleExtractorDoclet.SUMMARY_TAGLET_NAME); + if(!helpText.containsKey(key)) + return ""; + return helpText.getString(key); + } + + /** + * Gets the summary help text associated with a given walker type. + * @param walkerType Type of walker for which to search for help text. + * @return Walker summary description, or "" if none exists. + */ + public String getWalkerSummaryText(Class walkerType) { + String walkerSummary = String.format("%s.%s",walkerType.getName(), ResourceBundleExtractorDoclet.SUMMARY_TAGLET_NAME); + if(!helpText.containsKey(walkerSummary)) + return ""; + return helpText.getString(walkerSummary); + } + + /** + * Gets the summary help text associated with a given walker type. + * @param walker Walker for which to search for help text. + * @return Walker summary description, or "" if none exists. + */ + public String getWalkerSummaryText(Walker walker) { + return getWalkerSummaryText(walker.getClass()); + } + + /** + * Gets the descriptive help text associated with a given walker type. + * @param walkerType Type of walker for which to search for help text. + * @return Walker full description, or "" if none exists. + */ + public String getWalkerDescriptionText(Class walkerType) { + String walkerDescription = String.format("%s.%s",walkerType.getName(), ResourceBundleExtractorDoclet.DESCRIPTION_TAGLET_NAME); + if(!helpText.containsKey(walkerDescription)) + return ""; + return helpText.getString(walkerDescription); + } + + /** + * Gets the descriptive help text associated with a given walker type. + * @param walker Walker for which to search for help text. + * @return Walker full description, or "" if none exists. + */ + public String getWalkerDescriptionText(Walker walker) { + return getWalkerDescriptionText(walker.getClass()); + } + + /** + * Retrieves the walker class given a walker name. + * @param walkerName Name of the walker. + * @return Class representing the walker. + */ + public Class getWalkerClassByName(String walkerName) { + return getPluginsByName().get(walkerName); + } + + /** + * Rather than use the default exception, return a MalformedWalkerArgumentsException. + * @param errorMessage error message from formatErrorMessage() + * @return - A MalformedWalkerArgumentsException with errorMessage + */ + @Override + protected UserException createMalformedArgumentException(final String errorMessage) { + return new UserException.MalformedWalkerArgumentsException(errorMessage); + } + + /** + * Gets the data source for the provided walker. + * @param walkerClass The class of the walker. + * @return Which type of data source to traverse over...reads or reference? + */ + public static DataSource getWalkerDataSource(Class walkerClass) { + By byDataSource = walkerClass.getAnnotation(By.class); + if( byDataSource == null ) + throw new ReviewedGATKException("Unable to find By annotation for walker class " + walkerClass.getName()); + return byDataSource.value(); + } + + /** + * Gets the data source for the provided walker. + * @param walker The walker. + * @return Which type of data source to traverse over...reads or reference? + */ + public static DataSource getWalkerDataSource(Walker walker) { + return getWalkerDataSource(walker.getClass()); + } + + /** + * Get a list of RODs allowed by the walker. + * @param walkerClass Class of the walker to query. + * @return The list of allowed reference meta data. + */ + public static List getAllowsMetaData(Class walkerClass) { + return Collections.emptyList(); + } + + /** + * Determine whether the given walker supports the given data source. + * @param walkerClass Class of the walker to query. + * @param dataSource Source to check for . + * @return True if the walker forbids this data type. False otherwise. + */ + public static boolean isAllowed(Class walkerClass, DataSource dataSource) { + Allows allowsDataSource = getWalkerAllowed(walkerClass); + + // Allows is less restrictive than requires. If an allows + // clause is not specified, any kind of data is allowed. + if( allowsDataSource == null ) + return true; + + return Arrays.asList(allowsDataSource.value()).contains(dataSource); + } + + /** + * Determine whether the given walker supports the given data source. + * @param walker Walker to query. + * @param dataSource Source to check for . + * @return True if the walker forbids this data type. False otherwise. + */ + public static boolean isAllowed(Walker walker, DataSource dataSource) { + return isAllowed(walker.getClass(), dataSource); + } + + /** + * Determine whether the given walker supports the given reference ordered data. + * @param walkerClass Class of the walker to query. + * @param rod Source to check. + * @return True if the walker forbids this data type. False otherwise. + */ + public static boolean isAllowed(Class walkerClass, ReferenceOrderedDataSource rod) { + return true; + } + + /** + * Determine whether the given walker supports the given reference ordered data. + * @param walker Walker to query. + * @param rod Source to check. + * @return True if the walker forbids this data type. False otherwise. + */ + public static boolean isAllowed(Walker walker, ReferenceOrderedDataSource rod) { + return isAllowed(walker.getClass(), rod); + } + + /** + * Determine whether the given walker requires the given data source. + * @param walkerClass Class of the walker to query. + * @param dataSource Source to check for. + * @return True if the walker allows this data type. False otherwise. + */ + public static boolean isRequired(Class walkerClass, DataSource dataSource) { + Requires requiresDataSource = getWalkerRequirements(walkerClass); + return Arrays.asList(requiresDataSource.value()).contains(dataSource); + } + + /** + * Determine whether the given walker requires the given data source. + * @param walker Walker to query. + * @param dataSource Source to check for. + * @return True if the walker allows this data type. False otherwise. + */ + public static boolean isRequired(Walker walker, DataSource dataSource) { + return isRequired(walker.getClass(), dataSource); + } + + /** + * Get a list of RODs required by the walker. + * @param walkerClass Class of the walker to query. + * @return The list of required reference meta data. + */ + public static List getRequiredMetaData(Class walkerClass) { + return Collections.emptyList(); + } + + /** + * Get a list of RODs required by the walker. + * @param walker Walker to query. + * @return The list of required reference meta data. + */ + public static List getRequiredMetaData(Walker walker) { + return getRequiredMetaData(walker.getClass()); + } + + /** + * Reports whether this walker type is hidden -- in other words, whether it'll appear in the help output. + * @param walkerType Class to test for visibility. + * @return True if the walker should be hidden. False otherwise. + */ + public static boolean isHidden(Class walkerType) { + return walkerType.isAnnotationPresent(Hidden.class); + } + + /** + * Extracts filters that the walker has requested be run on the dataset. + * @param walkerClass Class of the walker to inspect for filtering requests. + * @param filterManager Manages the creation of filters. + * @return A non-empty list of filters to apply to the reads. + */ + public static List getReadFilters(Class walkerClass, FilterManager filterManager) { + List filters = new ArrayList(); + for(Class filterType: getReadFilterTypes(walkerClass)) + filters.add(filterManager.createFilterByType(filterType)); + return filters; + } + + /** + * Extracts filters that the walker has requested be run on the dataset. + * @param walker Walker to inspect for filtering requests. + * @param filterManager Manages the creation of filters. + * @return A non-empty list of filters to apply to the reads. + */ + public static List getReadFilters(Walker walker, FilterManager filterManager) { + return getReadFilters(walker.getClass(), filterManager); + } + + /** + * Gets the type of downsampling method requested by the walker. If an alternative + * downsampling method is specified on the command-line, the command-line version will + * be used instead. + * @param walker The walker to interrogate. + * @return The downsampling method, as specified by the walker. Null if none exists. + */ + public static DownsamplingMethod getDownsamplingMethod( Walker walker ) { + return getDownsamplingMethod(walker.getClass()); + } + + /** + * Gets the type of downsampling method requested by the walker. If an alternative + * downsampling method is specified on the command-line, the command-line version will + * be used instead. + * @param walkerClass The class of the walker to interrogate. + * @return The downsampling method, as specified by the walker. Null if none exists. + */ + public static DownsamplingMethod getDownsamplingMethod( Class walkerClass ) { + DownsamplingMethod downsamplingMethod = null; + + if( walkerClass.isAnnotationPresent(Downsample.class) ) { + Downsample downsampleParameters = walkerClass.getAnnotation(Downsample.class); + DownsampleType type = downsampleParameters.by(); + Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null; + Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null; + downsamplingMethod = new DownsamplingMethod(type, toCoverage, toFraction); + } + + return downsamplingMethod; + } + + public static T getWalkerAnnotation(final Walker walker, final Class clazz) { + return walker.getClass().getAnnotation(clazz); + } + + public static ReadTransformer.ApplicationTime getBAQApplicationTime(Walker walker) { + return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime(); + } + + /** + * Create a name for this type of walker. + * + * @param walkerType The type of walker. + * @return A name for this type of walker. + */ + @Override + public String getName(Class walkerType) { + String walkerName = ""; + + if (walkerType.getAnnotation(WalkerName.class) != null) + walkerName = ((WalkerName)walkerType.getAnnotation(WalkerName.class)).value().trim(); + else + walkerName = super.getName(walkerType); + + return walkerName; + } + + /** + * Utility to get the requires attribute from the walker. + * Throws an exception if requirements are missing. + * @param walkerClass Class of the walker to query for required data. + * @return Required data attribute. + */ + private static Requires getWalkerRequirements(Class walkerClass) { + Requires requiresDataSource = walkerClass.getAnnotation(Requires.class); + if( requiresDataSource == null ) + throw new ReviewedGATKException( "Unable to find data types required by walker class " + walkerClass.getName()); + return requiresDataSource; + } + + /** + * Utility to get the requires attribute from the walker. + * Throws an exception if requirements are missing. + * @param walker Walker to query for required data. + * @return Required data attribute. + */ + private static Requires getWalkerRequirements(Walker walker) { + return getWalkerRequirements(walker.getClass()); + } + + /** + * Utility to get the forbidden attribute from the walker. + * @param walkerClass Class of the walker to query for required data. + * @return Required data attribute. Null if forbidden info isn't present. + */ + private static Allows getWalkerAllowed(Class walkerClass) { + Allows allowsDataSource = walkerClass.getAnnotation(Allows.class); + return allowsDataSource; + } + + /** + * Utility to get the forbidden attribute from the walker. + * @param walker Walker to query for required data. + * @return Required data attribute. Null if forbidden info isn't present. + */ + private static Allows getWalkerAllowed(Walker walker) { + return getWalkerAllowed(walker.getClass()); + } + + /** + * Gets the list of filtering classes specified as walker annotations. + * @param walkerClass Class of the walker to inspect. + * @return An array of types extending from SamRecordFilter. Will never be null. + */ + public static Collection> getReadFilterTypes(Class walkerClass) { + List> filterTypes = new ArrayList>(); + while(walkerClass != null) { + // Add the read filters in the ReadFilters annotation + if(walkerClass.isAnnotationPresent(ReadFilters.class)) { + for ( Class c : walkerClass.getAnnotation(ReadFilters.class).value() ) { + if( !filterTypes.contains(c) ) + filterTypes.add(c); + } + } + // Remove read filters in the DisabledReadFilters annotation + if(walkerClass.isAnnotationPresent(DisabledReadFilters.class)) { + for ( Class c : walkerClass.getAnnotation(DisabledReadFilters.class).value() ) { + if ( filterTypes.contains(c) ) + filterTypes.remove(c); + } + } + walkerClass = walkerClass.getSuperclass(); + } + return filterTypes; + } + + /** + * Gets the list of filtering classes specified as walker annotations. + * @param walker The walker to inspect. + * @return An array of types extending from SamRecordFilter. Will never be null. + */ + public static Collection> getReadFilterTypes(Walker walker) { + return getReadFilterTypes(walker.getClass()); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/Aligner.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/Aligner.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/Aligner.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/Aligner.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/Alignment.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/Alignment.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/Alignment.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/Alignment.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWAAligner.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWAAligner.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWAAligner.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWAAligner.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWAConfiguration.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWAConfiguration.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWAConfiguration.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWAConfiguration.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWTFiles.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWTFiles.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWTFiles.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWTFiles.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignmentMatchSequence.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignmentMatchSequence.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignmentMatchSequence.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignmentMatchSequence.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignmentState.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignmentState.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignmentState.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignmentState.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/BWAAlignment.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/BWAAlignment.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/BWAAlignment.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/BWAAlignment.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/BWAJavaAligner.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/BWAJavaAligner.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/BWAJavaAligner.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/BWAJavaAligner.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/LowerBound.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/LowerBound.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/LowerBound.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/LowerBound.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/package-info.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/AMBWriter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/AMBWriter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/AMBWriter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/AMBWriter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/ANNWriter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/ANNWriter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/ANNWriter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/ANNWriter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWT.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWT.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWT.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWT.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTReader.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTReader.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTReader.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTReader.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTSupplementaryFileGenerator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTSupplementaryFileGenerator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTSupplementaryFileGenerator.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTSupplementaryFileGenerator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTWriter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTWriter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTWriter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTWriter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/Bases.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/Bases.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/Bases.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/Bases.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/Counts.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/Counts.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/Counts.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/Counts.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/CreateBWTFromReference.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/CreateBWTFromReference.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/CreateBWTFromReference.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/CreateBWTFromReference.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SequenceBlock.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SequenceBlock.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SequenceBlock.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SequenceBlock.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArray.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArray.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArray.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArray.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArrayReader.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArrayReader.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArrayReader.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArrayReader.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArrayWriter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArrayWriter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArrayWriter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArrayWriter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/BasePackedInputStream.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/BasePackedInputStream.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/BasePackedInputStream.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/BasePackedInputStream.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/BasePackedOutputStream.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/BasePackedOutputStream.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/BasePackedOutputStream.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/BasePackedOutputStream.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/CreatePACFromReference.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/CreatePACFromReference.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/CreatePACFromReference.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/CreatePACFromReference.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/PackUtils.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/PackUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/PackUtils.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/PackUtils.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/UnsignedIntPackedInputStream.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/UnsignedIntPackedInputStream.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/UnsignedIntPackedInputStream.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/UnsignedIntPackedInputStream.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/UnsignedIntPackedOutputStream.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/UnsignedIntPackedOutputStream.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/UnsignedIntPackedOutputStream.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/UnsignedIntPackedOutputStream.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/DbsnpArgumentCollection.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/DbsnpArgumentCollection.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/DbsnpArgumentCollection.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/DbsnpArgumentCollection.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java new file mode 100644 index 000000000..4fec3e240 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java @@ -0,0 +1,646 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.arguments; + +import htsjdk.samtools.ValidationStringency; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; +import org.broadinstitute.gatk.engine.samples.PedigreeValidationType; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.baq.BAQ; +import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType; +import org.broadinstitute.gatk.engine.GATKVCFUtils; + +import java.io.File; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** + * @author aaron + * @version 1.0 + */ +public class GATKArgumentCollection { + + /** the constructor */ + public GATKArgumentCollection() { + } + + // parameters and their defaults + /** + * An input file containing sequence data mapped to a reference, in SAM or BAM format, or a text file containing a + * list of input files (with extension .list). Note that the GATK requires an accompanying index for each SAM or + * BAM file. Please see our online documentation for more details on input formatting requirements. + */ + @Input(fullName = "input_file", shortName = "I", doc = "Input file containing sequence data (SAM or BAM)", required = false) + public List samFiles = new ArrayList<>(); + + @Advanced + @Argument(fullName = "showFullBamList",doc="Emit a log entry (level INFO) containing the full list of sequence data files to be included in the analysis (including files inside .bam.list files).") + public Boolean showFullBamList = false; + + @Advanced + @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false, minValue = 0) + public Integer readBufferSize = null; + + // -------------------------------------------------------------------------------------------------------------- + // + // GATKRunReport options + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * By default, GATK generates a run report that is uploaded to a cloud-based service. This report contains basic + * statistics about the run (which tool was used, whether the run was successful etc.) that help us for debugging + * and development. Up to version 3.3-0 the run report contains a record of the username and hostname associated + * with the run, but it does **NOT** contain any information that could be used to identify patient data. + * Nevertheless, if your data is subject to stringent confidentiality clauses (no outside communication) or if your + * run environment is not connected to the internet, you can disable the reporting system by seeting this option to + * "NO_ET". You will also need to request a key using the online request form on our website (see FAQs). + */ + @Argument(fullName = "phone_home", shortName = "et", doc="Run reporting mode", required = false) + public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.AWS; + /** + * Please see the "phone_home" argument below and the online documentation FAQs for more details on the key system + * and how to request a key. + */ + @Argument(fullName = "gatk_key", shortName = "K", doc="GATK key file required to run with -et NO_ET", required = false) + public File gatkKeyFile = null; + + /** + * The GATKRunReport supports tagging GATK runs with an arbitrary tag that can be + * used to group together runs during later analysis (as of GATK 2.2) . One use of this capability is to tag + * runs as GATK performance tests, so that the performance of the GATK over time can be assessed from the logs + * directly. + * + * Note that the tags do not conform to any ontology, so you are free to use any tags that you might find + * meaningful. + */ + @Argument(fullName = "tag", shortName = "tag", doc="Tag to identify this GATK run as part of a group of runs", required = false) + public String tag = "NA"; + + // -------------------------------------------------------------------------------------------------------------- + // + // General features + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Reads that fail the specified filters will not be used in the analysis. Multiple filters can be specified separately, + * e.g. you can do -rf MalformedRead -rf BadCigar and so on. Available read filters are listed in the online tool + * documentation. Note that the read name format is e.g. MalformedReadFilter, but at the command line the filter + * name should be given without the Filter suffix; e.g. -rf MalformedRead (NOT -rf MalformedReadFilter, which is not + * recognized by the program). Note also that some read filters are applied by default for some analysis tools; this + * is specified in each tool's documentation. The default filters can only be disabled if they are DisableableReadFilters. + */ + @Argument(fullName = "read_filter", shortName = "rf", doc = "Filters to apply to reads before analysis", required = false) + public final List readFilters = new ArrayList<>(); + + @Argument(fullName = "disable_read_filter", shortName = "drf", doc = "Read filters to disable", required = false) + public final List disabledReadFilters = new ArrayList<>(); + + @ArgumentCollection + public IntervalArgumentCollection intervalArguments = new IntervalArgumentCollection(); + /** + * The reference genome against which the sequence data was mapped. The GATK requires an index file and a dictionary + * file accompanying the reference (please see the online documentation FAQs for more details on these files). Although + * this argument is indicated as being optional, almost all GATK tools require a reference in order to run. + * Note also that while GATK can in theory process genomes from any organism with any number of chromosomes or contigs, + * it is not designed to process draft genome assemblies and performance will decrease as the number of contigs in + * the reference increases. We strongly discourage the use of unfinished genome assemblies containing more than a few + * hundred contigs. Contig numbers in the thousands will most probably cause memory-related crashes. + */ + @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) + public File referenceFile = null; + /** + * If this flag is enabled, the random numbers generated will be different in every run, causing GATK to behave non-deterministically. + */ + @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Use a non-deterministic random seed", required = false) + public boolean nonDeterministicRandomSeed = false; + /** + * To be used in the testing framework where dynamic parallelism can result in differing numbers of calls to the random generator. + */ + @Hidden + @Argument(fullName = "disableDithering",doc="Completely eliminates randomized dithering from rank sum tests.") + public boolean disableDithering = false; + /** + * This will truncate the run but without exiting with a failure. By default the value is interpreted in minutes, but this can be changed with the maxRuntimeUnits argument. + */ + @Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="Stop execution cleanly as soon as maxRuntime has been reached", required = false) + public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT; + + @Argument(fullName = "maxRuntimeUnits", shortName = "maxRuntimeUnits", doc="Unit of time used by maxRuntime", required = false) + public TimeUnit maxRuntimeUnits = TimeUnit.MINUTES; + + // -------------------------------------------------------------------------------------------------------------- + // + // Downsampling Arguments + // + // -------------------------------------------------------------------------------------------------------------- + /** + * There are several ways to downsample reads, i.e. to remove reads from the pile of reads that will be used for analysis. + * See the documentation of the individual downsampling options for details on how they work. Note that many GATK tools + * specify a default downsampling type and target, but this behavior can be overridden from the command line using the + * downsampling arguments. + */ + @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of read downsampling to employ at a given locus", required = false) + public DownsampleType downsamplingType = null; + /** + * Reads will be downsampled so the specified fraction remains; e.g. if you specify -dfrac 0.25, three-quarters of + * the reads will be removed, and the remaining one quarter will be used in the analysis. This method of downsampling + * is truly unbiased and random. It is typically used to simulate the effect of generating different amounts of + * sequence data for a given sample. For example, you can use this in a pilot experiment to evaluate how much target + * coverage you need to aim for in order to obtain enough coverage in all loci of interest. + */ + @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction of reads to downsample to", required = false, minValue = 0.0, maxValue = 1.0) + public Double downsampleFraction = null; + + /** + * The principle of this downsampling type is to downsample reads to a given capping threshold coverage. Its purpose is to + * get rid of excessive coverage, because above a certain depth, having additional data is not informative and imposes + * unreasonable computational costs. The downsampling process takes two different forms depending on the type of + * analysis it is used with. + * + * For locus-based traversals (LocusWalkers like UnifiedGenotyper and ActiveRegionWalkers like HaplotypeCaller), + * downsample_to_coverage controls the maximum depth of coverage at each locus. For read-based traversals + * (ReadWalkers like BaseRecalibrator), it controls the maximum number of reads sharing the same alignment start + * position. For ReadWalkers you will typically need to use much lower dcov values than you would with LocusWalkers + * to see an effect. Note that this downsampling option does not produce an unbiased random sampling from all available + * reads at each locus: instead, the primary goal of the to-coverage downsampler is to maintain an even representation + * of reads from all alignment start positions when removing excess coverage. For a truly unbiased random sampling of + * reads, use -dfrac instead. Also note that the coverage target is an approximate goal that is not guaranteed to be + * met exactly: the downsampling algorithm will under some circumstances retain slightly more or less coverage than + * requested. + */ + @Argument(fullName = "downsample_to_coverage", shortName = "dcov", + doc = "Target coverage threshold for downsampling to coverage", + required = false, minValue = 0) + public Integer downsampleCoverage = null; + + /** + * Gets the downsampling method explicitly specified by the user. If the user didn't specify + * a default downsampling mechanism, return the default. + * @return The explicitly specified downsampling mechanism, or the default if none exists. + */ + public DownsamplingMethod getDownsamplingMethod() { + if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null ) + return null; + + return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction); + } + + /** + * Set the downsampling method stored in the argument collection so that it is read back out when interrogating the command line arguments. + * @param method The downsampling mechanism. + */ + public void setDownsamplingMethod(DownsamplingMethod method) { + if (method == null) + throw new IllegalArgumentException("method is null"); + + downsamplingType = method.type; + downsampleCoverage = method.toCoverage; + downsampleFraction = method.toFraction; + } + + // -------------------------------------------------------------------------------------------------------------- + // + // BAQ arguments + // + // -------------------------------------------------------------------------------------------------------------- + @Advanced + @Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false) + public BAQ.CalculationMode BAQMode = BAQ.CalculationMode.OFF; + /** + * Phred-scaled gap open penalty for BAQ calculation. Although the default value is 40, a value of 30 may be better for whole genome call sets. + */ + @Advanced + @Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty", required = false, minValue = 0) + public double BAQGOP = BAQ.DEFAULT_GOP; + + // -------------------------------------------------------------------------------------------------------------- + // + // refactor NDN cigar string arguments + // + // -------------------------------------------------------------------------------------------------------------- + /** + * Some RNAseq aligners that use a known transcriptome resource (such as TopHat2) produce NDN elements in read CIGARS + * when a small exon is entirely deleted during transcription, which ends up looking like [exon1]NDN[exon3]. These + * rarely happen, but when they do they cause GATK to fail with an error. Setting this flag tells the GATK to + * reduce "NDN" to a simpler CIGAR representation with one N element (with total length of the three refactored + * elements). From the point of view of variant calling, there is no meaningful difference between the two + * representations. + */ + @Argument(fullName = "refactor_NDN_cigar_string", shortName = "fixNDN", doc = "Reduce NDN elements in CIGAR string", required = false) + public boolean REFACTOR_NDN_CIGAR_READS = false; + + // -------------------------------------------------------------------------------------------------------------- + // + // quality encoding checking arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * By default the GATK assumes that base quality scores start at Q0 == ASCII 33 according to the SAM specification. + * However, encoding in some datasets (especially older Illumina ones) starts at Q64. This argument will fix the + * encodings on the fly (as the data is read in) by subtracting 31 from every quality score. Note that this argument should + * NEVER be used by default; you should only use it when you have confirmed that the quality scores in your data are + * not in the correct encoding. + */ + @Argument(fullName = "fix_misencoded_quality_scores", shortName="fixMisencodedQuals", doc="Fix mis-encoded base quality scores", required = false) + public boolean FIX_MISENCODED_QUALS = false; + /** + * This flag tells GATK to ignore warnings when encountering base qualities that are too high and that seemingly + * indicate a problem with the base quality encoding of the BAM file. You should only use this if you really know + * what you are doing; otherwise you could seriously mess up your data and ruin your analysis. + */ + @Argument(fullName = "allow_potentially_misencoded_quality_scores", shortName="allowPotentiallyMisencodedQuals", doc="Ignore warnings about base quality score encoding", required = false) + public boolean ALLOW_POTENTIALLY_MISENCODED_QUALS = false; + /** + * This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration) which + * are stored in the OQ tag, if they are present, rather than use the post-recalibration quality scores. If no OQ + * tag is present for a read, the standard qual score will be used. + */ + @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "Use the base quality scores from the OQ tag", required=false) + public Boolean useOriginalBaseQualities = false; + /** + * If reads are missing some or all base quality scores, this value will be used for all base quality scores. + * By default this is set to -1 to disable default base quality assignment. + */ + @Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "Assign a default base quality", required=false, minValue = 0, maxValue = Byte.MAX_VALUE) + public byte defaultBaseQualities = -1; + + // -------------------------------------------------------------------------------------------------------------- + // + // performance log arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * The file name for the GATK performance log output, or null if you don't want to generate the + * detailed performance logging table. This table is suitable for importing into R or any + * other analysis software that can read tsv files. + */ + @Argument(fullName = "performanceLog", shortName="PF", doc="Write GATK runtime performance log to this file", required = false) + public File performanceLog = null; + + // -------------------------------------------------------------------------------------------------------------- + // + // BQSR arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Enables on-the-fly recalibrate of base qualities, intended primarily for use with BaseRecalibrator and PrintReads + * (see Best Practices workflow documentation). The covariates tables are produced by the BaseRecalibrator tool. + * Please be aware that you should only run recalibration with the covariates file created on the same input bam(s). + */ + @Input(fullName="BQSR", shortName="BQSR", required=false, doc="Input covariates table file for on-the-fly base quality score recalibration") + public File BQSR_RECAL_FILE = null; + + /** + * Turns on the base quantization module. It requires a recalibration report (-BQSR). + * + * A value of 0 here means "do not quantize". + * Any value greater than zero will be used to recalculate the quantization using that many levels. + * Negative values mean that we should quantize using the recalibration report's quantization level. + */ + @Advanced + @Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels (with -BQSR)", required=false) + public int quantizationLevels = 0; + + /** + * Turns off printing of the base insertion and base deletion tags when using the -BQSR argument. Only the base substitution qualities will be produced. + */ + @Argument(fullName="disable_indel_quals", shortName = "DIQ", doc = "Disable printing of base insertion and deletion tags (with -BQSR)", required=false) + public boolean disableIndelQuals = false; + + /** + * By default, the OQ tag in not emitted when using the -BQSR argument. Use this flag to include OQ tags in the output BAM file. + * Note that this may results in significant file size increase. + */ + @Argument(fullName="emit_original_quals", shortName = "EOQ", doc = "Emit the OQ tag with the original base qualities (with -BQSR)", required=false) + public boolean emitOriginalQuals = false; + + /** + * This flag tells GATK not to modify quality scores less than this value. Instead they will be written out unmodified in the recalibrated BAM file. + * In general it's unsafe to change qualities scores below < 6, since base callers use these values to indicate random or bad bases. + * For example, Illumina writes Q2 bases when the machine has really gone wrong. This would be fine in and of itself, + * but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect, + * your Q2 bin can be elevated to Q8 or Q10, leading to issues downstream. + */ + @Advanced + @Argument(fullName = "preserve_qscores_less_than", shortName = "preserveQ", doc = "Don't recalibrate bases with quality scores less than this threshold (with -BQSR)", required = false, minValue = 0, minRecommendedValue = QualityUtils.MIN_USABLE_Q_SCORE) + public int PRESERVE_QSCORES_LESS_THAN = QualityUtils.MIN_USABLE_Q_SCORE; + /** + * If specified, this value will be used as the prior for all mismatch quality scores instead of the actual reported quality score. + */ + @Advanced + @Argument(fullName = "globalQScorePrior", shortName = "globalQScorePrior", doc = "Global Qscore Bayesian prior to use for BQSR", required = false) + public double globalQScorePrior = -1.0; + + + // -------------------------------------------------------------------------------------------------------------- + // + // Other utility arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Keep in mind that if you set this to LENIENT, we may refuse to provide you with support if anything goes wrong. + */ + @Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation", required = false) + public ValidationStringency strictnessLevel = ValidationStringency.SILENT; + /** + * Some tools keep program records in the SAM header by default. Use this argument to override that behavior and discard program records for the SAM header. + */ + @Argument(fullName = "remove_program_records", shortName = "rpr", doc = "Remove program records from the SAM header", required = false) + public boolean removeProgramRecords = false; + /** + * Some tools discard program records from the SAM header by default. Use this argument to override that behavior and keep program records in the SAM header. + */ + @Argument(fullName = "keep_program_records", shortName = "kpr", doc = "Keep program records in the SAM header", required = false) + public boolean keepProgramRecords = false; + + /** + * On-the-fly sample renaming works only with single-sample BAM and VCF files. Each line of the mapping file must + * contain the absolute path to a BAM or VCF file, followed by whitespace, followed by the new sample name for that + * BAM or VCF file. The sample name may contain non-tab whitespace, but leading or trailing whitespace will be + * ignored. The engine will verify at runtime that each BAM/VCF targeted for sample renaming has only a single + * sample specified in its header (though, in the case of BAM files, there may be multiple read groups for that + * sample). + */ + @Advanced + @Argument(fullName = "sample_rename_mapping_file", shortName = "sample_rename_mapping_file", doc = "Rename sample IDs on-the-fly at runtime using the provided mapping file", required = false) + public File sampleRenameMappingFile = null; + + /** + * For expert users only who know what they are doing. We do not support usage of this argument, so we may refuse to help you if you use it and something goes wrong. The one exception to this rule is ALLOW_N_CIGAR_READS, which is necessary for RNAseq analysis. + */ + @Advanced + @Argument(fullName = "unsafe", shortName = "U", doc = "Enable unsafe operations: nothing will be checked at runtime", required = false) + public ValidationExclusion.TYPE unsafe; + /** + * Not recommended for general use. Disables both auto-generation of index files and index file locking + * when reading VCFs and other rods and an index isn't present or is out-of-date. The file locking necessary for auto index + * generation to work safely is prone to random failures/hangs on certain platforms, which makes it desirable to disable it + * for situations like test suite runs where the indices are already known to exist, however this option is unsafe in general + * because it allows reading from index files without first acquiring a lock. + */ + @Advanced + @Argument(fullName = "disable_auto_index_creation_and_locking_when_reading_rods", shortName = "disable_auto_index_creation_and_locking_when_reading_rods", + doc = "Disable both auto-generation of index files and index file locking", + required = false) + public boolean disableAutoIndexCreationAndLockingWhenReadingRods = false; + + @Hidden + @Argument(fullName = "no_cmdline_in_header", shortName = "no_cmdline_in_header", doc = "Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.", + required = false) + public boolean disableCommandLineInVCF = false; + + @Argument(fullName = "sites_only", shortName = "sites_only", doc = "Just output sites without genotypes (i.e. only the first 8 columns of the VCF)", + required = false) + public boolean sitesOnlyVCF = false; + + /** + *

    The VCF specification permits missing records to be dropped from the end of FORMAT fields, so long as GT is always output. + * This option prevents GATK from performing that trimming.

    + * + *

    For example, given a FORMAT of

    GT:AD:DP:PL
    , GATK will by default emit
    ./.
    for a variant with + * no reads present (ie, the AD, DP, and PL fields are trimmed). If you specify -writeFullFormat, this record + * would be emitted as
    ./.:.:.:.

    + */ + @Argument(fullName = "never_trim_vcf_format_field", shortName = "writeFullFormat", doc = "Always output all the records in VCF FORMAT fields, even if some are missing", + required = false) + public boolean neverTrimVCFFormatField = false; + + @Hidden + @Argument(fullName = "bcf", shortName = "bcf", doc = "Force BCF output, regardless of the file's extension", + required = false) + public boolean forceBCFOutput = false; + + @Advanced + @Argument(fullName = "bam_compression", shortName = "compress", doc = "Compression level to use for writing BAM files (0 - 9, higher is more compressed)", + minValue = 0, maxValue = 9, required = false) + public Integer bamCompression = null; + + @Advanced + @Argument(fullName = "simplifyBAM", shortName = "simplifyBAM", + doc = "If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well stripping all extended tags from the kept reads except the read group identifier", + required = false) + public boolean simplifyBAM = false; + + @Advanced + @Argument(fullName = "disable_bam_indexing", doc = "Turn off on-the-fly creation of indices for output BAM files.", + required = false) + public boolean disableBAMIndexing = false; + + @Argument(fullName = "generate_md5", doc = "Enable on-the-fly creation of md5s for output BAM files.", + required = false) + public boolean enableBAMmd5 = false; + + // -------------------------------------------------------------------------------------------------------------- + // + // Multi-threading arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Data threads contains N cpu threads per data thread, and act as completely data parallel processing, increasing + * the memory usage of GATK by M data threads. Data threads generally scale extremely effectively, up to 24 cores. + * See online documentation FAQs for more information. + */ + @Argument(fullName = "num_threads", shortName = "nt", doc = "Number of data threads to allocate to this analysis", required = false, minValue = 1) + public Integer numberOfDataThreads = 1; + + /** + * Each CPU thread operates the map cycle independently, but may run into earlier scaling problems with IO than + * data threads. Has the benefit of not requiring X times as much memory per thread as data threads do, but rather + * only a constant overhead. See online documentation FAQs for more information. + */ + @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="Number of CPU threads to allocate per data thread", required = false, minValue = 1) + public int numberOfCPUThreadsPerDataThread = 1; + + @Argument(fullName="num_io_threads", shortName = "nit", doc="Number of given threads to allocate to IO", required = false, minValue = 0) + @Hidden + public int numberOfIOThreads = 0; + + /** + * Enable GATK to monitor its own threading efficiency, at an itsy-bitsy tiny + * cost (< 0.1%) in runtime because of turning on the JavaBean. This is largely for + * debugging purposes. Note that this argument is not compatible with -nt, it only works with -nct. + */ + @Argument(fullName = "monitorThreadEfficiency", shortName = "mte", doc = "Enable threading efficiency monitoring", required = false) + public Boolean monitorThreadEfficiency = false; + + @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="Total number of BAM file handles to keep open simultaneously", required=false, minValue = 1) + public Integer numberOfBAMFileHandles = null; + /** + * This will filter out read groups matching : (e.g. SM:sample1) or a .txt file containing the filter strings one per line. + */ + @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Exclude read groups based on tags", required = false) + public List readGroupBlackList = null; + + // -------------------------------------------------------------------------------------------------------------- + // + // PED (pedigree) support + // + // -------------------------------------------------------------------------------------------------------------- + + /** + *

    Reads PED file-formatted tabular text files describing meta-data about the samples being + * processed in the GATK.

    + * + * + * + *

    The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:

    + * + *
      + *
    • Family ID
    • + *
    • Individual ID
    • + *
    • Paternal ID
    • + *
    • Maternal ID
    • + *
    • Sex (1=male; 2=female; other=unknown)
    • + *
    • Phenotype
    • + *
    + * + *

    The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. + * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a + * quantitative trait or an affection status column: GATK will automatically detect which type + * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).

    + * + *

    If an individual's sex is unknown, then any character other than 1 or 2 can be used.

    + * + *

    You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that + * line will be ignored. Do not start any family IDs with this character therefore.

    + * + *

    Affection status should be coded:

    + * + *
      + *
    • -9 missing
    • + *
    • 0 missing
    • + *
    • 1 unaffected
    • + *
    • 2 affected
    • + *
    + * + *

    If any value outside of -9,0,1,2 is detected than the samples are assumed + * to phenotype values are interpreted as string phenotype values. In this case -9 uniquely + * represents the missing value.

    + * + *

    Genotypes (column 7 onwards) cannot be specified to the GATK.

    + * + *

    For example, here are two individuals (one row = one person):

    + * + *
    +     *   FAM001  1  0 0  1  2
    +     *   FAM001  2  0 0  1  2
    +     * 
    + * + *

    Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to + * tell the GATK PED parser that the corresponding fields are missing from the ped file.

    + * + *

    Note that most GATK walkers do not use pedigree information. Walkers that require pedigree + * data should clearly indicate so in their arguments and will throw errors if required pedigree + * information is missing.

    + */ + @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false) + public List pedigreeFiles = Collections.emptyList(); + + /** + * Inline PED records (see -ped argument). Each -pedString STRING can contain one or more + * valid PED records (see -ped) separated by semi-colons. Supports all tags for each pedString + * as -ped supports + */ + @Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for samples",required=false) + public List pedigreeStrings = Collections.emptyList(); + + /** + * How strict should we be in parsing the PED files? + */ + @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="Validation strictness for pedigree information",required=false) + public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT; + + // -------------------------------------------------------------------------------------------------------------- + // + // BAM indexing and sharding arguments + // + // -------------------------------------------------------------------------------------------------------------- + /** + * NO INTEGRATION TESTS are available. Use at your own risk. + */ + @Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM",required=false) + @Hidden + public boolean allowIntervalsWithUnindexedBAM = false; + + // -------------------------------------------------------------------------------------------------------------- + // + // testing BCF2 + // + // -------------------------------------------------------------------------------------------------------------- + /** + * If provided, whenever we create a VCFWriter we will also write out a BCF file alongside it, for testing purposes. + */ + @Argument(fullName="generateShadowBCF",shortName = "generateShadowBCF",doc="Write a BCF copy of the output VCF",required=false) + @Hidden + public boolean generateShadowBCF = false; + // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed + + // -------------------------------------------------------------------------------------------------------------- + // + // VCF/BCF index parameters + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Specify the Tribble indexing strategy to use for VCFs. + * + * LINEAR creates a LinearIndex with bins of equal width, specified by the Bin Width parameter + * INTERVAL creates an IntervalTreeIndex with bins with an equal amount of features, specified by the Features Per Bin parameter + * DYNAMIC_SEEK attempts to optimize for minimal seek time by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) + * DYNAMIC_SIZE attempts to optimize for minimal index size by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) + * + * This argument is deprecated, using the output file ".g.vcf" extension will automatically set the appropriate value + */ + @Argument(fullName="variant_index_type",shortName = "variant_index_type",doc="Type of IndexCreator to use for VCF/BCF indices",required=false) + @Advanced + public GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE; + /** + * This is either the bin width or the number of features per bin, depending on the indexing strategy + * + * This argument is deprecated, using the output file ".g.vcf" extension will automatically set the appropriate value + */ + @Argument(fullName="variant_index_parameter",shortName = "variant_index_parameter",doc="Parameter to pass to the VCF/BCF IndexCreator",required=false) + @Advanced + public int variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER; +} + diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardVariantContextInputArgumentCollection.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardVariantContextInputArgumentCollection.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardVariantContextInputArgumentCollection.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardVariantContextInputArgumentCollection.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/crypt/CryptUtils.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/crypt/CryptUtils.java new file mode 100644 index 000000000..cbbbe47e1 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/crypt/CryptUtils.java @@ -0,0 +1,391 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.crypt; + +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.io.IOUtils; + +import javax.crypto.Cipher; +import java.io.File; +import java.io.InputStream; +import java.security.*; +import java.security.spec.InvalidKeySpecException; +import java.security.spec.KeySpec; +import java.security.spec.PKCS8EncodedKeySpec; +import java.security.spec.X509EncodedKeySpec; +import java.util.Arrays; + +/** + * A set of cryptographic utility methods and constants. + * + * Contains methods to: + * + * -Create a public/private key pair + * -Read and write public/private keys to/from files/streams + * -Load the GATK master private/public keys + * -Encrypt/decrypt data + * + * Also contains constants that control the cryptographic defaults + * throughout the GATK. + * + * @author David Roazen + */ +public class CryptUtils { + + // --------------------------------------------------------------------------------- + // Constants (these control the default cryptographic settings throughout the GATK): + // --------------------------------------------------------------------------------- + + /** + * Default key length in bits of newly-created keys. 2048 bits provides a good balance between + * security and speed. + */ + public static final int DEFAULT_KEY_LENGTH = 2048; + + /** + * Default encryption algorithm to use, when none is specified. + */ + public static final String DEFAULT_ENCRYPTION_ALGORITHM = "RSA"; + + /** + * Default random-number generation algorithm to use, when none is specified. + */ + public static final String DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM = "SHA1PRNG"; + + /** + * Name of the public key file distributed with the GATK. This file is packaged + * into the GATK jar, and we use the system ClassLoader to find it. + */ + public static final String GATK_DISTRIBUTED_PUBLIC_KEY_FILE_NAME = "GATK_public.key"; + + /** + * Location of the master copy of the GATK private key. + */ + public static final String GATK_MASTER_PRIVATE_KEY_FILE = "/humgen/gsa-hpprojects/GATK/data/gatk_master_keys/GATK_private.key"; + + /** + * Location of the master copy of the GATK public key. This file should always be the same as + * the public key file distributed with the GATK (and there are automated tests to ensure that it is). + */ + public static final String GATK_MASTER_PUBLIC_KEY_FILE = "/humgen/gsa-hpprojects/GATK/data/gatk_master_keys/GATK_public.key"; + + /** + * Directory where generated GATK user keys are stored. See the GATKKey class for more information. + */ + public static final String GATK_USER_KEY_DIRECTORY = "/humgen/gsa-hpprojects/GATK/data/gatk_user_keys/"; + + + // ----------------------- + // Utility Methods: + // ----------------------- + + /** + * Generate a new public/private key pair using the default encryption settings defined above. + * + * @return A new public/private key pair created using the default settings + */ + public static KeyPair generateKeyPair() { + return generateKeyPair(DEFAULT_KEY_LENGTH, DEFAULT_ENCRYPTION_ALGORITHM, DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM); + } + + /** + * Generate a new public/private key pair using custom encryption settings. + * + * @param keyLength Length of the key in bits + * @param encryptionAlgorithm Encryption algorithm to use + * @param randNumberAlgorithm Random-number generation algorithm to use + * @return A new public/private key pair, created according to the specified parameters + */ + public static KeyPair generateKeyPair( int keyLength, String encryptionAlgorithm, String randNumberAlgorithm ) { + try { + KeyPairGenerator keyGen = KeyPairGenerator.getInstance(encryptionAlgorithm); + SecureRandom randomnessSource = createRandomnessSource(randNumberAlgorithm); + + keyGen.initialize(keyLength, randomnessSource); + return keyGen.generateKeyPair(); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedGATKException(String.format("Could not find an implementation of the requested encryption algorithm %s", encryptionAlgorithm), e); + } + catch ( Exception e ) { + throw new ReviewedGATKException("Error while generating key pair", e); + } + } + + /** + * Create a source of randomness using the default random-number generation algorithm. + * + * @return A randomness source that uses the default algorithm + */ + public static SecureRandom createRandomnessSource() { + return createRandomnessSource(DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM); + } + + /** + * Create a source of randomness using a custom random-number generation algorithm. + * + * @param randAlgorithm The random-number generation algorithm to use + * @return A randomness sources that uses the specified algorithm + */ + public static SecureRandom createRandomnessSource ( String randAlgorithm ) { + try { + return SecureRandom.getInstance(randAlgorithm); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedGATKException(String.format("Could not find an implementation of the requested random-number generation algorithm %s", randAlgorithm), e); + } + } + + /** + * Writes a public/private key pair to disk + * + * @param keyPair The key pair we're writing to disk + * @param privateKeyFile Location to write the private key + * @param publicKeyFile Location to write the public key + */ + public static void writeKeyPair ( KeyPair keyPair, File privateKeyFile, File publicKeyFile ) { + writeKey(keyPair.getPrivate(), privateKeyFile); + writeKey(keyPair.getPublic(), publicKeyFile); + } + + /** + * Writes an arbitrary key to disk + * + * @param key The key to write + * @param destination Location to write the key to + */ + public static void writeKey ( Key key, File destination ) { + IOUtils.writeByteArrayToFile(key.getEncoded(), destination); + } + + /** + * Reads in a public key created using the default encryption algorithm from a file. + * + * @param source File containing the public key + * @return The public key read + */ + public static PublicKey readPublicKey ( File source ) { + return decodePublicKey(IOUtils.readFileIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); + } + + /** + * Reads in a public key created using the default encryption algorithm from a stream. + * + * @param source Stream attached to the public key + * @return The public key read + */ + public static PublicKey readPublicKey ( InputStream source ) { + return decodePublicKey(IOUtils.readStreamIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); + } + + /** + * Decodes the raw bytes of a public key into a usable object. + * + * @param rawKey The encoded bytes of a public key as read from, eg., a file. The + * key must be in the standard X.509 format for a public key. + * @param encryptionAlgorithm The encryption algorithm used to create the public key + * @return The public key as a usable object + */ + public static PublicKey decodePublicKey ( byte[] rawKey, String encryptionAlgorithm ) { + try { + KeySpec keySpec = new X509EncodedKeySpec(rawKey); + KeyFactory keyFactory = KeyFactory.getInstance(encryptionAlgorithm); + return keyFactory.generatePublic(keySpec); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedGATKException(String.format("Could not find an implementation of the requested encryption algorithm %s", encryptionAlgorithm), e); + } + catch ( InvalidKeySpecException e ) { + throw new ReviewedGATKException("Unable to use X.509 key specification to decode the given key", e); + } + } + + /** + * Reads in a private key created using the default encryption algorithm from a file. + * + * @param source File containing the private key + * @return The private key read + */ + public static PrivateKey readPrivateKey ( File source ) { + return decodePrivateKey(IOUtils.readFileIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); + } + + /** + * Reads in a private key created using the default encryption algorithm from a stream. + * + * @param source Stream attached to the private key + * @return The private key read + */ + public static PrivateKey readPrivateKey ( InputStream source ) { + return decodePrivateKey(IOUtils.readStreamIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); + } + + /** + * Decodes the raw bytes of a private key into a usable object. + * + * @param rawKey The encoded bytes of a private key as read from, eg., a file. The + * key must be in the standard PKCS #8 format for a private key. + * @param encryptionAlgorithm The encryption algorithm used to create the private key + * @return The private key as a usable object + */ + public static PrivateKey decodePrivateKey ( byte[] rawKey, String encryptionAlgorithm ) { + try { + KeySpec keySpec = new PKCS8EncodedKeySpec(rawKey); + KeyFactory keyFactory = KeyFactory.getInstance(encryptionAlgorithm); + return keyFactory.generatePrivate(keySpec); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedGATKException(String.format("Could not find an implementation of the requested encryption algorithm %s", encryptionAlgorithm), e); + } + catch ( InvalidKeySpecException e ) { + throw new ReviewedGATKException("Unable to use the PKCS #8 key specification to decode the given key", e); + } + } + + /** + * Loads the copy of the GATK public key that is distributed with the GATK. Uses the system + * ClassLoader to locate the public key file, which should be stored at the root of the GATK + * jar file. + * + * @return The GATK public key as a usable object + */ + public static PublicKey loadGATKDistributedPublicKey() { + InputStream publicKeyInputStream = ClassLoader.getSystemResourceAsStream(GATK_DISTRIBUTED_PUBLIC_KEY_FILE_NAME); + + if ( publicKeyInputStream == null ) { + throw new ReviewedGATKException(String.format("Could not locate the GATK public key %s in the classpath", + GATK_DISTRIBUTED_PUBLIC_KEY_FILE_NAME)); + } + + return readPublicKey(publicKeyInputStream); + } + + /** + * Loads the master copy of the GATK private key. You must have the appropriate UNIX permissions + * to do this! + * + * @return The GATK master private key as a usable object + */ + public static PrivateKey loadGATKMasterPrivateKey() { + return readPrivateKey(new File(GATK_MASTER_PRIVATE_KEY_FILE)); + } + + /** + * Loads the master copy of the GATK public key. This should always be the same as the + * public key distributed with the GATK returned by loadGATKDistributedPublicKey(). + * + * @return The GATK master public key as a usable object + */ + public static PublicKey loadGATKMasterPublicKey() { + return readPublicKey(new File(GATK_MASTER_PUBLIC_KEY_FILE)); + } + + /** + * Encrypts the given data using the key provided. + * + * @param data The data to encrypt, as a byte array + * @param encryptKey The key with which to encrypt the data + * @return The encrypted version of the provided data + */ + public static byte[] encryptData ( byte[] data, Key encryptKey ) { + return transformDataUsingCipher(data, encryptKey, Cipher.ENCRYPT_MODE); + } + + /** + * Decrypts the given data using the key provided. + * + * @param encryptedData Data to decrypt, as a byte array + * @param decryptKey The key with which to decrypt the data + * @return The decrypted version of the provided data + */ + public static byte[] decryptData ( byte[] encryptedData, Key decryptKey ) { + return transformDataUsingCipher(encryptedData, decryptKey, Cipher.DECRYPT_MODE); + } + + /** + * Helper method for encryption/decryption that takes data and processes it using + * the given key + * + * @param data Data to encrypt/decrypt + * @param key Key to use to encrypt/decrypt the data + * @param cipherMode Specifies whether we are encrypting or decrypting + * @return The encrypted/decrypted data + */ + private static byte[] transformDataUsingCipher ( byte[] data, Key key, int cipherMode ) { + try { + Cipher cipher = Cipher.getInstance(key.getAlgorithm()); + cipher.init(cipherMode, key); + return cipher.doFinal(data); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedGATKException(String.format("Could not find an implementation of the requested algorithm %s", + key.getAlgorithm()), e); + } + catch ( InvalidKeyException e ) { + throw new ReviewedGATKException("Key is invalid", e); + } + catch ( GeneralSecurityException e ) { + throw new ReviewedGATKException("Error during encryption", e); + } + } + + /** + * Tests whether the public/private keys provided can each decrypt data encrypted by + * the other key -- ie., tests whether these two keys are part of the same public/private + * key pair. + * + * @param privateKey The private key to test + * @param publicKey The public key to test + * @return True if the keys are part of the same key pair and can decrypt each other's + * encrypted data, otherwise false. + */ + public static boolean keysDecryptEachOther ( PrivateKey privateKey, PublicKey publicKey ) { + byte[] plainText = "Test PlainText".getBytes(); + + byte[] dataEncryptedUsingPrivateKey = CryptUtils.encryptData(plainText, privateKey); + byte[] dataEncryptedUsingPublicKey = CryptUtils.encryptData(plainText, publicKey); + + byte[] privateKeyDataDecryptedWithPublicKey = CryptUtils.decryptData(dataEncryptedUsingPrivateKey, publicKey); + byte[] publicKeyDataDecryptedWithPrivateKey = CryptUtils.decryptData(dataEncryptedUsingPublicKey, privateKey); + + // Make sure we actually transformed the data during encryption: + if ( Arrays.equals(plainText, dataEncryptedUsingPrivateKey) || + Arrays.equals(plainText, dataEncryptedUsingPublicKey) || + Arrays.equals(dataEncryptedUsingPrivateKey, dataEncryptedUsingPublicKey) ) { + return false; + } + + // Make sure that we were able to recreate the original plaintext using + // both the public key on the private-key-encrypted data and the private + // key on the public-key-encrypted data: + if ( ! Arrays.equals(plainText, privateKeyDataDecryptedWithPublicKey) || + ! Arrays.equals(plainText, publicKeyDataDecryptedWithPrivateKey) ) { + return false; + } + + return true; + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/crypt/GATKKey.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/crypt/GATKKey.java new file mode 100644 index 000000000..42a88b9d0 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/crypt/GATKKey.java @@ -0,0 +1,350 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.crypt; + +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.io.IOUtils; + +import java.io.*; +import java.security.*; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +/** + * Class to represent a GATK user key. + * + * A GATK user key contains an email address and a cryptographic signature. + * The signature is the SHA-1 hash of the email address encrypted using + * the GATK master private key. The GATK master public key (distributed + * with the GATK) is used to decrypt the signature and validate the key + * at the start of each GATK run that requires a key. + * + * Keys are cryptographically secure in that valid keys definitely come + * from us and cannot be fabricated, however nothing prevents keys from + * being shared between users. + * + * GATK user keys have the following on-disk format: + * + * GZIP Container: + * Email address + * NUL byte (delimiter) + * Cryptographic Signature (encrypted SHA-1 hash of email address) + * + * The key data is wrapped within a GZIP container to placate over-zealous + * email filters (since keys must often be emailed) and also to provide an + * additional integrity check via the built-in GZIP CRC. + * + * @author David Roazen + */ +public class GATKKey { + + /** + * Private key used to sign the GATK key. Required only when creating a new + * key from scratch, not when loading an existing key from disk. + */ + private PrivateKey privateKey; + + /** + * Public key used to validate the GATK key. + */ + private PublicKey publicKey; + + /** + * The user's email address, stored within the key and signed. + */ + private String emailAddress; + + /** + * The cryptographic signature of the email address. By default, this is + * the SHA-1 hash of the email address encrypted using the RSA algorithm. + */ + private byte[] signature; + + /** + * The combination of hash/encryption algorithms to use to generate the signature. + * By default this is "SHA1withRSA" + */ + private String signingAlgorithm; + + /** + * Default hash/encryption algorithms to use to sign the key. + */ + public static final String DEFAULT_SIGNING_ALGORITHM = "SHA1withRSA"; + + /** + * Byte value used to separate the email address from its signature in the key file. + */ + public static final byte GATK_KEY_SECTIONAL_DELIMITER = 0; + + + // ----------------------- + // Constructors: + // ----------------------- + + /** + * Constructor to create a new GATK key from scratch using an email address + * and public/private key pair. The private key is used for signing, and the + * public key is used to validate the newly-created key. + * + * @param privateKey Private key used to sign the new GATK key + * @param publicKey Public key used to validate the new GATK key + * @param emailAddress The user's email address, which we will store in the key and sign + */ + public GATKKey ( PrivateKey privateKey, PublicKey publicKey, String emailAddress ) { + this(privateKey, publicKey, emailAddress, DEFAULT_SIGNING_ALGORITHM); + } + + /** + * Constructor to create a new GATK key from scratch using an email address + * and public/private key pair, and additionally specify the signing algorithm + * to use. The private key is used for signing, and the public key is used to + * validate the newly-created key. + * + * @param privateKey Private key used to sign the new GATK key + * @param publicKey Public key used to validate the new GATK key + * @param emailAddress The user's email address, which we will store in the key and sign + * @param signingAlgorithm The combination of hash and encryption algorithms to use to sign the key + */ + public GATKKey ( PrivateKey privateKey, PublicKey publicKey, String emailAddress, String signingAlgorithm ) { + if ( privateKey == null || publicKey == null || emailAddress == null || emailAddress.length() == 0 || signingAlgorithm == null ) { + throw new ReviewedGATKException("Cannot construct GATKKey using null/empty arguments"); + } + + this.privateKey = privateKey; + this.publicKey = publicKey; + this.emailAddress = emailAddress; + this.signingAlgorithm = signingAlgorithm; + + validateEmailAddress(); + generateSignature(); + + if ( ! isValid() ) { + throw new ReviewedGATKException("Newly-generated GATK key fails validation -- this should never happen!"); + } + } + + /** + * Constructor to load an existing GATK key from a file. + * + * During loading, the key file is checked for integrity, but not cryptographic + * validity (which must be done through a subsequent call to isValid()). + * + * @param publicKey Public key that will be used to validate the loaded GATK key + * in subsequent calls to isValid() + * @param keyFile File containing the GATK key to load + */ + public GATKKey ( PublicKey publicKey, File keyFile ) { + this(publicKey, keyFile, DEFAULT_SIGNING_ALGORITHM); + } + + /** + * Constructor to load an existing GATK key from a file, and additionally specify + * the signing algorithm used to sign the key being loaded. + * + * During loading, the key file is checked for integrity, but not cryptographic + * validity (which must be done through a subsequent call to isValid()). + * + * @param publicKey Public key that will be used to validate the loaded GATK key + * in subsequent calls to isValid() + * @param keyFile File containing the GATK key to load + * @param signingAlgorithm The combination of hash and encryption algorithms used to sign the key + */ + public GATKKey ( PublicKey publicKey, File keyFile, String signingAlgorithm ) { + if ( publicKey == null || keyFile == null || signingAlgorithm == null ) { + throw new ReviewedGATKException("Cannot construct GATKKey using null arguments"); + } + + this.publicKey = publicKey; + this.signingAlgorithm = signingAlgorithm; + + readKey(keyFile); + } + + // ----------------------- + // Public API Methods: + // ----------------------- + + /** + * Writes out this key to a file in the format described at the top of this class, + * encapsulating the key within a GZIP container. + * + * @param destination File to write the key to + */ + public void writeKey ( File destination ) { + try { + byte[] keyBytes = marshalKeyData(); + IOUtils.writeByteArrayToStream(keyBytes, new GZIPOutputStream(new FileOutputStream(destination))); + } + catch ( IOException e ) { + throw new UserException.CouldNotCreateOutputFile(destination, e); + } + } + + /** + * Checks whether the signature of this key is cryptographically valid (ie., can be + * decrypted by the public key to produce a valid SHA-1 hash of the email address + * in the key). + * + * @return True if the key's signature passes validation, otherwise false + */ + public boolean isValid() { + try { + Signature sig = Signature.getInstance(signingAlgorithm); + sig.initVerify(publicKey); + sig.update(emailAddress.getBytes()); + return sig.verify(signature); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedGATKException(String.format("Signing algorithm %s not found", signingAlgorithm), e); + } + catch ( InvalidKeyException e ) { + // If the GATK public key is invalid, it's likely our problem, not the user's: + throw new ReviewedGATKException(String.format("Public key %s is invalid", publicKey), e); + } + catch ( SignatureException e ) { + throw new UserException.UnreadableKeyException("Signature is invalid or signing algorithm was unable to process the input data", e); + } + } + + // ----------------------- + // Private Helper Methods: + // ----------------------- + + /** + * Helper method that creates a signature for this key using the combination of + * hash/encryption algorithms specified at construction time. + */ + private void generateSignature() { + try { + Signature sig = Signature.getInstance(signingAlgorithm); + sig.initSign(privateKey, CryptUtils.createRandomnessSource()); + sig.update(emailAddress.getBytes()); + signature = sig.sign(); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedGATKException(String.format("Signing algorithm %s not found", signingAlgorithm), e); + } + catch ( InvalidKeyException e ) { + throw new ReviewedGATKException(String.format("Private key %s is invalid", privateKey), e); + } + catch ( SignatureException e ) { + throw new ReviewedGATKException(String.format("Error creating signature for email address %s", emailAddress), e); + } + } + + /** + * Helper method that reads in a GATK key from a file. Should not be called directly -- + * use the appropriate constructor above. + * + * @param source File to read the key from + */ + private void readKey ( File source ) { + try { + byte[] keyBytes = IOUtils.readStreamIntoByteArray(new GZIPInputStream(new FileInputStream(source))); + + // As a sanity check, compare the number of bytes read to the uncompressed file size + // stored in the GZIP ISIZE field. If they don't match, the key must be corrupt: + if ( keyBytes.length != IOUtils.getGZIPFileUncompressedSize(source) ) { + throw new UserException.UnreadableKeyException("Number of bytes read does not match the uncompressed size specified in the GZIP ISIZE field"); + } + + unmarshalKeyData(keyBytes); + } + catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(source, e); + } + catch ( IOException e ) { + throw new UserException.UnreadableKeyException(source, e); + } + catch ( UserException.CouldNotReadInputFile e ) { + throw new UserException.UnreadableKeyException(source, e); + } + } + + /** + * Helper method that assembles the email address and signature into a format + * suitable for writing to disk. + * + * @return The aggregated key data, ready to be written to disk + */ + private byte[] marshalKeyData() { + byte[] emailAddressBytes = emailAddress.getBytes(); + byte[] assembledKey = new byte[emailAddressBytes.length + 1 + signature.length]; + + System.arraycopy(emailAddressBytes, 0, assembledKey, 0, emailAddressBytes.length); + assembledKey[emailAddressBytes.length] = GATK_KEY_SECTIONAL_DELIMITER; + System.arraycopy(signature, 0, assembledKey, emailAddressBytes.length + 1, signature.length); + + return assembledKey; + } + + /** + * Helper method that parses the raw key data from disk into its component + * email address and signature. Performs some basic validation in the process. + * + * @param keyBytes The raw, uncompressed key data read from disk + */ + private void unmarshalKeyData ( byte[] keyBytes ) { + int delimiterPosition = -1; + + for ( int i = 0; i < keyBytes.length; i++ ) { + if ( keyBytes[i] == GATK_KEY_SECTIONAL_DELIMITER ) { + delimiterPosition = i; + break; + } + } + + if ( delimiterPosition == -1 ) { + throw new UserException.UnreadableKeyException("Malformed GATK key contains no sectional delimiter"); + } + else if ( delimiterPosition == 0 ) { + throw new UserException.UnreadableKeyException("Malformed GATK key contains no email address"); + } + else if ( delimiterPosition == keyBytes.length - 1 ) { + throw new UserException.UnreadableKeyException("Malformed GATK key contains no signature"); + } + + byte[] emailAddressBytes = new byte[delimiterPosition]; + System.arraycopy(keyBytes, 0, emailAddressBytes, 0, delimiterPosition); + emailAddress = new String(emailAddressBytes); + + signature = new byte[keyBytes.length - delimiterPosition - 1]; + System.arraycopy(keyBytes, delimiterPosition + 1, signature, 0, keyBytes.length - delimiterPosition - 1); + } + + /** + * Helper method that ensures that the user's email address does not contain the NUL byte, which we + * reserve as a delimiter within each key file. + */ + private void validateEmailAddress() { + for ( byte b : emailAddress.getBytes() ) { + if ( b == GATK_KEY_SECTIONAL_DELIMITER ) { + throw new UserException(String.format("Email address must not contain a byte with value %d", GATK_KEY_SECTIONAL_DELIMITER)); + } + } + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/package-info.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusView.java new file mode 100644 index 000000000..4bcecbcad --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusView.java @@ -0,0 +1,169 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.engine.iterators.GenomeLocusIterator; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.Collections; +import java.util.List; +import java.util.NoSuchElementException; +/** + * User: hanna + * Date: May 13, 2009 + * Time: 3:32:30 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * A LocusView over which the user can iterate. + */ + +public class AllLocusView extends LocusView { + private GenomeLocusIterator locusIterator; + + /** + * Gets the next position in the view: next call to next() will jump there. + * Note that both nextPosition and nextLocus are PRE-read and cached. + */ + private GenomeLoc nextPosition = null; + + /** + * What's the next available context? + */ + private AlignmentContext nextLocus = null; + + /** + * Signal not to advance the iterator because we're currently sitting at the next element. + */ + private boolean atNextElement = false; + + /** + * Create a new queue of locus contexts. + * + * @param provider + */ + public AllLocusView(LocusShardDataProvider provider) { + super(provider); + // Seed the state tracking members with the first possible seek position and the first possible locus context. + locusIterator = new GenomeLocusIterator(genomeLocParser, provider.getLocus()); + } + + public boolean hasNext() { + advance(); + return nextPosition != null; + } + + public AlignmentContext next() { + advance(); + + if (nextPosition == null) + throw new NoSuchElementException("No next is available in the all locus view"); + + // Flag to the iterator that no data is waiting in the queue to be processed. + atNextElement = false; + + AlignmentContext currentLocus; + + // If actual data is present, return it. Otherwise, return empty data. + if (nextLocus != null && nextLocus.getLocation().equals(nextPosition)) + currentLocus = nextLocus; + else + currentLocus = createEmptyLocus(nextPosition); + + return currentLocus; + } + + private void advance() { + // Already at the next element? Don't move forward. + if (atNextElement) + return; + + // Out of elements? + if (nextPosition == null && !locusIterator.hasNext()) + return; + + // If nextLocus has been consumed, clear it out to make room for the next incoming locus. + if (nextPosition != null && nextLocus != null && !nextLocus.getLocation().isPast(nextPosition)) { + nextLocus = null; + + // Determine the next locus. The trick is that we may have more than one alignment context at the same + // reference position (regular base pileup, then extended pileup). If next alignment context (that we just pre-read) + // is still at the current position, we do not increment current position and wait for next call to next() to return + // that context. If we know that next context is past the current position, we are done with current + // position + if (hasNextLocus()) { + nextLocus = nextLocus(); + if (nextPosition.equals(nextLocus.getLocation())) { + atNextElement = true; + return; + } + } + } + + // No elements left in queue? Clear out the position state tracker and return. + if (!locusIterator.hasNext()) { + nextPosition = null; + return; + } + + // Actually fill the next position. + nextPosition = locusIterator.next(); + atNextElement = true; + + // Crank the iterator to (if possible) or past the next context. Be careful not to hold a reference to nextLocus + // while using the hasNextLocus() / nextLocus() machinery; this will cause us to use more memory than is optimal. + while (nextLocus == null || nextLocus.getLocation().isBefore(nextPosition)) { + nextLocus = null; + if (!hasNextLocus()) + break; + nextLocus = nextLocus(); + } + } + + /** + * Creates a blank locus context at the specified location. + * + * @param site Site at which to create the blank locus context. + * @return empty context. + */ + private final static List EMPTY_PILEUP_READS = Collections.emptyList(); + private final static List EMPTY_PILEUP_OFFSETS = Collections.emptyList(); + private final static List EMPTY_DELETION_STATUS = Collections.emptyList(); + + private AlignmentContext createEmptyLocus(GenomeLoc site) { + return new AlignmentContext(site, new ReadBackedPileupImpl(site, EMPTY_PILEUP_READS, EMPTY_PILEUP_OFFSETS)); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusView.java new file mode 100644 index 000000000..777e23cb8 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusView.java @@ -0,0 +1,63 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +/** + * User: hanna + * Date: May 12, 2009 + * Time: 11:24:42 AM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * A queue of locus contexts. Provides unidirectional seek. Stripped down + * implementation of java.util.Queue interface. + */ + +public class CoveredLocusView extends LocusView { + /** + * Create a new queue of locus contexts. + * @param provider + */ + public CoveredLocusView(LocusShardDataProvider provider) { + super(provider); + } + + public boolean hasNext() { + return hasNextLocus(); + } + + public AlignmentContext next() { + return nextLocus(); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalOverlappingRODsFromStream.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalOverlappingRODsFromStream.java new file mode 100644 index 000000000..1525c381a --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalOverlappingRODsFromStream.java @@ -0,0 +1,168 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import htsjdk.samtools.util.PeekableIterator; +import org.broadinstitute.gatk.utils.refdata.RODRecordListImpl; +import org.broadinstitute.gatk.utils.refdata.utils.GATKFeature; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.GenomeLoc; + +import java.util.Collection; +import java.util.LinkedList; +import java.util.ListIterator; + +/** + * Key algorithmic helper for ReadBasedReferenceOrderedData + * + * Takes a single iterator of features, and provides a single capability that returns + * the list of RODs that overlap an interval. Allows sequential getOverlapping calls + * from intervals provided that these intervals always have increasing getStart() values. + * + */ +class IntervalOverlappingRODsFromStream { + /** + * Only held for QC purposes + */ + GenomeLoc lastQuery = null; + + private final String name; + private final LinkedList currentFeatures = new LinkedList(); + private final PeekableIterator futureFeatures; + + /** + * Create a new IntervalOverlappingRODsFromStream that reads elements from futureFeatures and + * returns RODRecordLists having name + * + * @param name + * @param futureFeatures + */ + IntervalOverlappingRODsFromStream(final String name, final PeekableIterator futureFeatures) { + if ( futureFeatures == null ) throw new IllegalArgumentException("futureFeatures cannot be null"); + + this.name = name; + this.futureFeatures = futureFeatures; + } + + /** + * Get the list of RODs overlapping loc from this stream of RODs. + * + * @param loc the interval to query + * @return a non-null RODRecordList containing the overlapping RODs, which may be empty + */ + @Ensures({"overlaps(loc, result)", + "! futureFeatures.hasNext() || futureFeatures.peek().getLocation().isPast(loc)", + "result != null"}) + public RODRecordList getOverlapping(final GenomeLoc loc) { + if ( lastQuery != null && loc.getStart() < lastQuery.getStart() ) + throw new IllegalArgumentException(String.format("BUG: query interval (%s) starts before the previous interval %s", loc, lastQuery)); + + readOverlappingFutureFeatures(loc); + return new RODRecordListImpl(name, subsetToOverlapping(loc, currentFeatures), loc); + } + + + /** + * For contract assurance. Checks that all bindings in loc overlap + * + * @param loc + * @param bindings + * @return + */ + @Requires({"loc != null", "bindings != null"}) + private boolean overlaps(final GenomeLoc loc, final RODRecordList bindings) { + for ( final GATKFeature feature : bindings ) + if ( ! feature.getLocation().overlapsP(loc) ) + return false; + return true; + } + + /** + * Subset the features in all to those that overlap with loc + * + * The current features list contains everything read that cannot be thrown away yet, but not + * everything in there necessarily overlaps with loc. Subset to just those that do overlap + * + * @param loc the location that features must overlap + * @param all the list of all features + * @return a subset of all that overlaps with loc + */ + @Requires({"loc != null", "all != null"}) + @Ensures("result.size() <= all.size()") + private Collection subsetToOverlapping(final GenomeLoc loc, final Collection all) { + final LinkedList overlapping = new LinkedList(); + for ( final GATKFeature feature : all ) + if ( feature.getLocation().overlapsP(loc) ) + overlapping.add(feature); + return overlapping; + } + + /** + * Update function. Remove all elements of currentFeatures that end before loc + * + * Must be called by clients periodically when they know they they will never ask for data before + * loc, so that the running cache of RODs doesn't grow out of control. + * + * @param loc the location to use + */ + @Requires("loc != null") + @Ensures("currentFeatures.size() <= old(currentFeatures.size())") + public void trimCurrentFeaturesToLoc(final GenomeLoc loc) { + final ListIterator it = currentFeatures.listIterator(); + while ( it.hasNext() ) { + final GATKFeature feature = it.next(); + if ( feature.getLocation().isBefore(loc) ) + it.remove(); + } + } + + /** + * Update function: Read all elements from futureFeatures that overlap with loc + * + * Stops at the first element that starts before the end of loc, or the stream empties + * + * @param loc + */ + @Requires("loc != null") + @Ensures("currentFeatures.size() >= old(currentFeatures.size())") + private void readOverlappingFutureFeatures(final GenomeLoc loc) { + while ( futureFeatures.hasNext() ) { + final GenomeLoc nextLoc = futureFeatures.peek().getLocation(); + if ( nextLoc.isBefore(loc) ) { + futureFeatures.next(); // next rod element is before loc, throw it away and keep looking + } else if ( nextLoc.isPast(loc) ) { + break; // next element is past loc, stop looking but don't pop it + } else if ( nextLoc.overlapsP(loc) ) { + // add overlapping elements to our current features, removing from stream + for ( final GATKFeature feature : futureFeatures.next() ) { + currentFeatures.add(feature); + } + } + } + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedView.java new file mode 100644 index 000000000..4dfc31d86 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedView.java @@ -0,0 +1,182 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import htsjdk.samtools.util.PeekableIterator; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * a ROD view that allows for requests for RODs that overlap intervals on the genome to produce a RefMetaDataTracker + */ +public class IntervalReferenceOrderedView implements ReferenceOrderedView { + /** a list of the RMDDataState (location->iterators) */ + private final List states = new ArrayList<>(1); + + /** + * Used to get genome locs for reads + */ + protected final GenomeLocParser genomeLocParser; + + /** + * The total extent of all reads in this span. We create iterators from our RODs + * from the start of this span, to the end. + */ + private final GenomeLoc shardSpan; + + /** + * Create a new IntervalReferenceOrderedView taking data from provider and capable of + * servicing ROD overlap requests within the genomic interval span + * + * @param provider a ShardDataProvider to give us data + * @param span a GenomeLoc span, or null indicating take the entire genome + */ + public IntervalReferenceOrderedView(final ShardDataProvider provider, final GenomeLoc span) { + if ( provider == null ) throw new IllegalArgumentException("provider cannot be null"); + if ( provider.hasReferenceOrderedData() && span == null ) throw new IllegalArgumentException("span cannot be null when provider has reference ordered data"); + + this.genomeLocParser = provider.getGenomeLocParser(); + this.shardSpan = span; + provider.register(this); + + // conditional to optimize the case where we don't have any ROD data + if ( provider.hasReferenceOrderedData() && ! shardSpan.isUnmapped() ) { + for (final ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData()) + states.add(new RMDDataState(dataSource, dataSource.seek(shardSpan))); + } + } + + /** + * Testing constructor + */ + protected IntervalReferenceOrderedView(final GenomeLocParser genomeLocParser, + final GenomeLoc shardSpan, + final List names, + final List> featureSources) { + this.genomeLocParser = genomeLocParser; + this.shardSpan = shardSpan; + for ( int i = 0; i < names.size(); i++ ) + states.add(new RMDDataState(names.get(i), featureSources.get(i))); + } + + public Collection> getConflictingViews() { + List> classes = new ArrayList<>(); + classes.add(ManagingReferenceOrderedView.class); + return classes; + } + + /** + * Get a RefMetaDataTracker containing bindings for all RODs overlapping the start position of loc + * @param loc a GenomeLoc of size == 1 + * @return a non-null RefMetaDataTracker + */ + @Override + public RefMetaDataTracker getReferenceOrderedDataAtLocus(GenomeLoc loc) { + if ( loc == null ) throw new IllegalArgumentException("loc cannot be null"); + if ( loc.size() != 1 ) throw new IllegalArgumentException("GenomeLoc must have size == 1 but got " + loc); + return getReferenceOrderedDataForInterval(loc); + } + + /** + * Get a RefMetaDataTracker containing bindings for all RODs overlapping interval + * + * @param interval a non=null interval + * @return a non-null RefMetaDataTracker + */ + public RefMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) { + if ( interval == null ) throw new IllegalArgumentException("Interval cannot be null"); + + if ( states.isEmpty() || shardSpan.isUnmapped() ) // optimization for no bindings (common for read walkers) + return RefMetaDataTracker.EMPTY_TRACKER; + else { + final List bindings = new ArrayList<>(states.size()); + for ( final RMDDataState state : states ) + bindings.add(state.stream.getOverlapping(interval)); + return new RefMetaDataTracker(bindings); + } + } + + /** + * Trim down all of the ROD managers so that they only hold ROD bindings wit start >= startOfDataToKeep.getStart() + * + * @param startOfDataToKeep a non-null genome loc + */ + public void trimCurrentFeaturesToLoc(final GenomeLoc startOfDataToKeep) { + if ( startOfDataToKeep == null ) throw new IllegalArgumentException("startOfDataToKeep cannot be null"); + + for ( final RMDDataState state : states ) + state.stream.trimCurrentFeaturesToLoc(startOfDataToKeep); + } + + /** + * Closes the current view. + */ + public void close() { + for (final RMDDataState state : states) + state.close(); + + // Clear out the existing data so that post-close() accesses to this data will fail-fast. + states.clear(); + } + + /** + * Models the traversal state of a given ROD lane. + */ + private static class RMDDataState { + public final ReferenceOrderedDataSource dataSource; + public final IntervalOverlappingRODsFromStream stream; + private final LocationAwareSeekableRODIterator iterator; + + public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) { + this.dataSource = dataSource; + this.iterator = iterator; + this.stream = new IntervalOverlappingRODsFromStream(dataSource.getName(), new PeekableIterator<>(iterator)); + } + + /** + * For testing + */ + public RMDDataState(final String name, final PeekableIterator iterator) { + this.dataSource = null; + this.iterator = null; + this.stream = new IntervalOverlappingRODsFromStream(name, new PeekableIterator<>(iterator)); + } + + public void close() { + if ( dataSource != null ) + dataSource.close( iterator ); + } + } +} + diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/InvalidPositionException.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/InvalidPositionException.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/InvalidPositionException.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/InvalidPositionException.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceView.java new file mode 100644 index 000000000..d4278c9b2 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceView.java @@ -0,0 +1,236 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import htsjdk.samtools.reference.ReferenceSequence; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.engine.walkers.Reference; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.engine.walkers.Window; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Provides access to the portion of the reference covering a single locus. + */ +public class LocusReferenceView extends ReferenceView { + /** + * Bound the reference view to make sure all accesses are within the shard. + */ + private GenomeLoc bounds; + + /** + * Start of the expanded window for which the reference context should be provided, + * relative to the locus in question. + */ + private final int windowStart; + + + /** + * Start of the expanded window for which the reference context should be provided, + * relative to the locus in question. + */ + private final int windowStop; + + /** + * Track the reference sequence and the last point accessed. Used to + * track state when traversing over the reference. + */ + private ReferenceSequence referenceSequence; + + /** + * Create a LocusReferenceView given no other contextual information about + * the walkers, etc. + * @param provider source for locus data. + */ + public LocusReferenceView( LocusShardDataProvider provider ) { + super(provider); + initializeBounds(provider); + windowStart = windowStop = 0; + initializeReferenceSequence(bounds); + } + + /** + * Create a new locus reference view. + * @param provider source for locus data. + */ + public LocusReferenceView( Walker walker, LocusShardDataProvider provider ) { + super( provider ); + initializeBounds(provider); + + // Retrieve information about the window being accessed. + if( walker.getClass().isAnnotationPresent(Reference.class) ) { + Window window = walker.getClass().getAnnotation(Reference.class).window(); + + if( window.start() > 0 ) throw new ReviewedGATKException( "Reference window starts after current locus" ); + if( window.stop() < 0 ) throw new ReviewedGATKException( "Reference window ends before current locus" ); + + windowStart = window.start(); + windowStop = window.stop(); + } + else { + windowStart = 0; + windowStop = 0; + } + + if(bounds != null) { + int expandedStart = getWindowStart( bounds ); + int expandedStop = getWindowStop( bounds ); + initializeReferenceSequence(genomeLocParser.createGenomeLoc(bounds.getContig(), bounds.getContigIndex(), expandedStart, expandedStop)); + } + } + + /** + * Initialize the bounds of this shard, trimming the bounds so that they match the reference. + * @param provider Provider covering the appropriate locus. + */ + private void initializeBounds(LocusShardDataProvider provider) { + if(provider.getLocus() != null) { + int sequenceLength = reference.getSequenceDictionary().getSequence(provider.getLocus().getContig()).getSequenceLength(); + bounds = genomeLocParser.createGenomeLoc(provider.getLocus().getContig(), + Math.max(provider.getLocus().getStart(),1), + Math.min(provider.getLocus().getStop(),sequenceLength)); + } + else + bounds = null; + } + + /** + * Initialize reference sequence data using the given locus. + * @param locus + */ + private void initializeReferenceSequence( GenomeLoc locus ) { + this.referenceSequence = reference.getSubsequenceAt( locus.getContig(), locus.getStart(), locus.getStop() ); + } + + protected GenomeLoc trimToBounds(GenomeLoc l) { + int expandedStart = getWindowStart( bounds ); + int expandedStop = getWindowStop( bounds ); + if ( l.getStart() < expandedStart ) l = genomeLocParser.setStart(l, expandedStart); + if ( l.getStop() > expandedStop ) l = genomeLocParser.setStop(l, expandedStop); + return l; + } + + public class Provider implements ReferenceContext.ReferenceContextRefProvider { + int refStart, len; + + public Provider( int refStart, int len ) { + this.refStart = refStart; + this.len = len; + } + + public byte[] getBases() { + //System.out.printf("Getting bases for location%n"); + byte[] bases = new byte[len]; + System.arraycopy(referenceSequence.getBases(), refStart, bases, 0, len); + return bases; + } + } + + /** + * Gets the reference context associated with this particular point or extended interval on the genome. + * @param genomeLoc Region for which to retrieve the base(s). If region spans beyond contig end or beyond current bounds, it will be trimmed down. + * @return The base at the position represented by this genomeLoc. + */ + public ReferenceContext getReferenceContext( GenomeLoc genomeLoc ) { + //validateLocation( genomeLoc ); + + GenomeLoc window = genomeLocParser.createGenomeLoc( genomeLoc.getContig(), genomeLoc.getContigIndex(), + getWindowStart(genomeLoc), getWindowStop(genomeLoc) ); + + int refStart = -1; + if (bounds != null) { + window = trimToBounds(window); + refStart = (int)(window.getStart() - getWindowStart(bounds)); + } + else { + if(referenceSequence == null || referenceSequence.getContigIndex() != genomeLoc.getContigIndex()) + referenceSequence = reference.getSequence(genomeLoc.getContig()); + refStart = (int)window.getStart()-1; + } + + int len = (int)window.size(); + return new ReferenceContext( genomeLocParser, genomeLoc, window, new Provider(refStart, len)); + } + + /** + * Allow the user to pull reference info from any arbitrary region of the reference. + * @param genomeLoc The locus. + * @return A list of the bases starting at the start of the locus (inclusive) and ending + * at the end of the locus (inclusive). + */ + public byte[] getReferenceBases( GenomeLoc genomeLoc ) { + return super.getReferenceBases(genomeLoc); + } + + /** + * Gets the start of the expanded window, bounded if necessary by the contig. + * @param locus The locus to expand. + * @return The expanded window. + */ + private int getWindowStart( GenomeLoc locus ) { + // If the locus is not within the bounds of the contig it allegedly maps to, expand only as much as we can. + if(locus.getStart() < 1) return 1; +// if(locus.getStart() < 1) return locus.getStart(); + return Math.max( locus.getStart() + windowStart, 1 ); + } + + /** + * Gets the stop of the expanded window, bounded if necessary by the contig. + * @param locus The locus to expand. + * @return The expanded window. + */ + private int getWindowStop( GenomeLoc locus ) { + // If the locus is not within the bounds of the contig it allegedly maps to, expand only as much as we can. + int sequenceLength = reference.getSequenceDictionary().getSequence(locus.getContig()).getSequenceLength(); + if(locus.getStop() > sequenceLength) return sequenceLength; + return Math.min( locus.getStop() + windowStop, sequenceLength ); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusShardDataProvider.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusShardDataProvider.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusShardDataProvider.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusShardDataProvider.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusView.java new file mode 100644 index 000000000..11437cf2c --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusView.java @@ -0,0 +1,219 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import org.broadinstitute.gatk.engine.ReadProperties; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.locusiterator.LocusIterator; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; + +import java.util.Arrays; +import java.util.Collection; +import java.util.NoSuchElementException; + +/** + * User: hanna + * Date: May 13, 2009 + * Time: 3:30:16 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * The two goals of the LocusView are as follows: + * 1) To provide a 'trigger track' iteration interface so that TraverseLoci can easily switch + * between iterating over all bases in a region, only covered bases in a region covered by + * reads, only bases in a region covered by RODs, or any other sort of trigger track + * implementation one can think of. + * 2) To manage the copious number of iterators that have to be jointly pulled through the + * genome to make a locus traversal function. + */ +public abstract class LocusView extends LocusIterator implements View { + /** + * The locus bounding this view. + */ + protected GenomeLoc locus; + + /** + * The GenomeLocParser, used to create new genome locs. + */ + protected GenomeLocParser genomeLocParser; + + /** + * Source info for this view. Informs the class about downsampling requirements. + */ + private ReadProperties sourceInfo; + + /** + * The actual locus context iterator. + */ + private LocusIterator loci; + + /** + * The next locus context from the iterator. Lazy loaded: if nextLocus is null and advance() doesn't + * populate it, the iterator is exhausted. If populated, this is the value that should be returned by + * next(). + */ + private AlignmentContext nextLocus = null; + + public LocusView(LocusShardDataProvider provider) { + this.locus = provider.getLocus(); + + this.sourceInfo = provider.getSourceInfo(); + this.genomeLocParser = provider.getGenomeLocParser(); + this.loci = provider.getLocusIterator(); + + advance(); + + provider.register(this); + } + + /** + * Only one view of the locus is supported at any given time. + * @return A list consisting of all other locus views. + */ + public Collection> getConflictingViews() { + return Arrays.>asList(LocusView.class,ReadView.class); + } + + /** + * Close this view. + */ + public void close() { + // Set everything to null with the hope of failing fast. + locus = null; + sourceInfo = null; + loci = null; + + super.close(); + } + + /** + * Is there another covered locus context bounded by this view. + * @return True if another covered locus context exists. False otherwise. + */ + public abstract boolean hasNext(); + + /** + * Returns the next covered locus context in the shard. + * @return Next covered locus context in the shard. + * @throw NoSuchElementException if no such element exists. + */ + public abstract AlignmentContext next(); + + /** + * Unsupported. + * @throw UnsupportedOperationException always. + */ + public void remove() { + throw new UnsupportedOperationException("Unable to remove elements from this queue."); + } + + /** + * Is there another locus context bounded by this shard. + * @return True if another locus context is bounded by this shard. + */ + protected boolean hasNextLocus() { + advance(); + return nextLocus != null; + } + + /** + * Get the next locus context bounded by this shard. + * @return Next locus context bounded by this shard. + * @throw NoSuchElementException if the next element is missing. + */ + protected AlignmentContext nextLocus() { + advance(); + if(nextLocus == null) + throw new NoSuchElementException("No more elements remain in locus context queue."); + + // Cache the current and apply filtering. + AlignmentContext current = nextLocus; + + // Indicate that the next operation will need to advance. + nextLocus = null; + + return current; + } + + /** + * Seed the nextLocus variable with the contents of the next locus (if one exists). + */ + private void advance() { + // Already an unclaimed locus present + if(nextLocus != null) + return; + + //System.out.printf("loci is %s%n", loci); + if( !loci.hasNext() ) { + nextLocus = null; + return; + } + + nextLocus = loci.next(); + + // If the location of this shard is available, trim the data stream to match the shard. + // TODO: Much of this functionality is being replaced by the WindowMaker. + if(locus != null) { + // Iterate through any elements not contained within this shard. + while( nextLocus != null && !isContainedInShard(nextLocus.getLocation()) && loci.hasNext() ) + nextLocus = loci.next(); + + // If nothing in the shard was found, indicate that by setting nextLocus to null. + if( nextLocus != null && !isContainedInShard(nextLocus.getLocation()) ) + nextLocus = null; + } + } + + /** + * Is this location contained in the given shard. + * @param location Location to check. + * @return True if the given location is contained within the shard. False otherwise. + */ + private boolean isContainedInShard(GenomeLoc location) { + return locus.containsP(location); + } + + /** + * {@inheritDoc} + * + * Since this class has an actual LIBS, so this function will never throw an exception + * + * @return the LocusIteratorByState used by this view to get pileups + */ + @Override + public LocusIteratorByState getLIBS() { + return loci.getLIBS(); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ManagingReferenceOrderedView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ManagingReferenceOrderedView.java new file mode 100644 index 000000000..17e8c4290 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ManagingReferenceOrderedView.java @@ -0,0 +1,116 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.GenomeLoc; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +/** + * User: hanna + * Date: May 21, 2009 + * Time: 2:49:17 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * A view into the reference-ordered data in the provider. + */ +public class ManagingReferenceOrderedView implements ReferenceOrderedView { + /** + * The data sources along with their current states. + */ + private List states = new ArrayList(); + + /** + * Create a new view of reference-ordered data. + * @param provider + */ + public ManagingReferenceOrderedView( LocusShardDataProvider provider ) { + for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) + states.add(new ReferenceOrderedDataState(dataSource, dataSource.seek(provider.getLocus()))); + + provider.register(this); + } + + public Collection> getConflictingViews() { return Collections.emptyList(); } + + /** + * Gets an object which can track the reference-ordered data at every locus. + * @param loc Locus at which to track. + * @return A tracker containing information about this locus. + */ + @Override + public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) { + if ( states.isEmpty() ) + return RefMetaDataTracker.EMPTY_TRACKER; + else { + List bindings = new ArrayList(states.size()); + + for ( ReferenceOrderedDataState state: states ) + // todo -- warning, I removed the reference to the name from states + bindings.add( state.iterator.seekForward(loc) ); + + return new RefMetaDataTracker(bindings); + } + } + + /** + * Closes the current view. + */ + public void close() { + for( ReferenceOrderedDataState state: states ) + state.dataSource.close( state.iterator ); + + // Clear out the existing data so that post-close() accesses to this data will fail-fast. + states = null; + } +} + +/** + * Models the traversal state of a given ROD lane. + */ +class ReferenceOrderedDataState { + public final ReferenceOrderedDataSource dataSource; + public final LocationAwareSeekableRODIterator iterator; + + public ReferenceOrderedDataState( ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator ) { + this.dataSource = dataSource; + this.iterator = iterator; + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RODMetaDataContainer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RODMetaDataContainer.java new file mode 100644 index 000000000..197abd49a --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RODMetaDataContainer.java @@ -0,0 +1,83 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import org.broadinstitute.gatk.utils.refdata.utils.GATKFeature; +import org.broadinstitute.gatk.utils.collections.Pair; + +import java.util.*; + + +/** + * + * @author aaron + * + * Class RODMetaDataContainer + * + * stores both the name and the class for each ROD. This class assumes that: + * + * -Names must be unique + * -Classes are allowed to have duplicates + * + * This class encapsulates the ref data associations, and provides lookup by name and by + * class type. + * + */ +public class RODMetaDataContainer { + // we only allow non-duplicate ROD names, a HashMap is fine + private final HashMap nameMap = new HashMap(); + + // we do allow duplicate class entries, so we need to store pairs of data + private final List> classMap = new ArrayList>(); + + public void addEntry(GATKFeature data) { + nameMap.put(data.getName(),data); + classMap.add(new Pair(data.getClass(),data)); + } + + public Collection getSet(String name) { + if (name == null) return getSet(); + Set set = new HashSet(); + if (nameMap.containsKey(name)) set.add(nameMap.get(name)); + return set; + } + + /** + * get the feature contents of this container; the unfiltered set without their name association + * @return + */ + public Collection getSet() { + return new ArrayList(nameMap.values()); + } + + // the brute force (n) search ended up being faster than sorting and binary search in all but the most extreme cases (thousands of RODs at a location). + public Collection getSet(Class cls) { + Collection ret = new ArrayList(); + for (Pair pair: classMap) + if (pair.first.equals(cls)) ret.add(pair.second); + return ret; + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadBasedReferenceOrderedView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadBasedReferenceOrderedView.java new file mode 100644 index 000000000..dea8acf5f --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadBasedReferenceOrderedView.java @@ -0,0 +1,60 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.engine.datasources.reads.ReadShard; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.GenomeLoc; + +/** a ROD view for reads. This provides the Read traversals a way of getting a RefMetaDataTracker */ +public class ReadBasedReferenceOrderedView extends IntervalReferenceOrderedView { + public ReadBasedReferenceOrderedView(final ShardDataProvider provider) { + super(provider, provider.hasReferenceOrderedData() ? ((ReadShard)provider.getShard()).getReadsSpan() : null); + } + + /** + * create a RefMetaDataTracker given the current read + * + * @param rec the read + * + * @return a RefMetaDataTracker for the read, from which you can get ROD -> read alignments + */ + @Requires("rec != null") + @Ensures("result != null") + public RefMetaDataTracker getReferenceOrderedDataForRead(final SAMRecord rec) { + if ( rec.getReadUnmappedFlag() ) + return RefMetaDataTracker.EMPTY_TRACKER; + else { + final GenomeLoc readSpan = genomeLocParser.createGenomeLoc(rec); + trimCurrentFeaturesToLoc(readSpan); + return getReferenceOrderedDataForInterval(readSpan); + } + } +} + diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceView.java new file mode 100644 index 000000000..c7b2575be --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceView.java @@ -0,0 +1,102 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.GenomeLoc; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * User: hanna + * Date: May 22, 2009 + * Time: 12:36:14 PM + * + */ + +/** Provides access to the reference over a single read. */ + +public class ReadReferenceView extends ReferenceView { + /** + * Create a view of the reference with respect to a single read. + * + * @param provider + */ + public ReadReferenceView( ShardDataProvider provider ) { + super(provider); + } + + protected ReferenceContext.ReferenceContextRefProvider getReferenceBasesProvider( GenomeLoc genomeLoc ) { + return new Provider(genomeLoc); + } + + public class Provider implements ReferenceContext.ReferenceContextRefProvider { + GenomeLoc loc; + + public Provider( GenomeLoc loc ) { + this.loc = loc; + } + + public byte[] getBases() { + return getReferenceBases(loc); + } + } + + /** + * Return a reference context appropriate for the span of read + * + * @param read the mapped read to test + * @return + */ + public ReferenceContext getReferenceContext( final SAMRecord read ) { + GenomeLoc loc = genomeLocParser.createGenomeLoc(read); + return new ReferenceContext( genomeLocParser, loc, loc, getReferenceBasesProvider(loc) ); + } + +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java new file mode 100644 index 000000000..f9629f5c8 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java @@ -0,0 +1,82 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import org.broadinstitute.gatk.engine.datasources.reads.Shard; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.GenomeLocParser; + +import java.util.Collection; + +/** + * Present data sharded by read to a traversal engine. + * + * @author mhanna + * @version 0.1 + */ +public class ReadShardDataProvider extends ShardDataProvider { + /** + * The raw collection of reads. + */ + private final GATKSAMIterator reads; + + /** + * Create a data provider for the shard given the reads and reference. + * @param shard The chunk of data over which traversals happen. + * @param reference A getter for a section of the reference. + */ + public ReadShardDataProvider(Shard shard, GenomeLocParser genomeLocParser, GATKSAMIterator reads, IndexedFastaSequenceFile reference, Collection rods) { + super(shard,genomeLocParser,reference,rods); + this.reads = reads; + } + + /** + * Can this data source provide reads? + * @return True if reads are available, false otherwise. + */ + public boolean hasReads() { + return reads != null; + } + + /** + * Gets an iterator over all the reads bound by this shard. + * @return An iterator over all reads in this shard. + */ + public GATKSAMIterator getReadIterator() { + return reads; + } + + @Override + public void close() { + super.close(); + + if(reads != null) + reads.close(); + } + +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadView.java new file mode 100644 index 000000000..ec879fdfd --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadView.java @@ -0,0 +1,88 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; + +import java.util.Arrays; +import java.util.Collection; +/** + * User: hanna + * Date: May 22, 2009 + * Time: 12:06:54 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * A view into the reads that a provider can provide. + */ +public class ReadView implements View, Iterable { + /** + * The iterator into the reads supplied by this provider. + */ + private GATKSAMIterator reads; + + /** + * Create a new view of the reads given the current data set. + * @param provider Source for the data. + */ + public ReadView( ReadShardDataProvider provider ) { + reads = provider.getReadIterator(); + } + + /** + * Other reads and loci conflict with this view. + * @return Array of reads and loci. + */ + public Collection> getConflictingViews() { + return Arrays.>asList(ReadView.class, LocusView.class); + } + + /** + * Close the view over these reads. Note that this method closes just + * the view into the reads, not the reads themselves. + */ + public void close() { + // Don't close the reads. The provider is responsible for this. + // Just dispose of the pointer. + reads = null; + } + + /** + * Gets an iterator into the reads supplied by this provider. + * @return Iterator into the reads that this provider covers. + */ + public GATKSAMIterator iterator() { + return reads; + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedView.java new file mode 100644 index 000000000..3be983d4a --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedView.java @@ -0,0 +1,33 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.GenomeLoc; + +public interface ReferenceOrderedView extends View { + RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ); +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceView.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceView.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceView.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RodLocusView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RodLocusView.java new file mode 100644 index 000000000..297ccbedd --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RodLocusView.java @@ -0,0 +1,196 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.collections.RODMergingIterator; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; + +import java.util.*; + +/** + * A view into the reference-ordered data in the provider. + */ +public class RodLocusView extends LocusView implements ReferenceOrderedView { + /** + * The data sources along with their current states. + */ + private RODMergingIterator rodQueue = null; + + Collection allTracksHere; + + GenomeLoc lastLoc = null; + RODRecordList interval = null; + + /** + * The data sources along with their current states. + */ + private List states = new ArrayList(); + + /** + * Enable debugging output -- todo remove me + */ + final static boolean DEBUG = false; + + final static String INTERVAL_ROD_NAME = "interval"; + + /** + * Create a new view of reference-ordered data. + * + * @param provider + */ + public RodLocusView( LocusShardDataProvider provider ) { + super(provider); + + GenomeLoc loc = provider.getLocus(); + + List< Iterator > iterators = new LinkedList< Iterator >(); + for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) { + if ( DEBUG ) System.out.printf("Shard is %s%n", provider.getLocus()); + + // grab the ROD iterator from the data source, and compute the first location in this shard, forwarding + // the iterator to immediately before it, so that it can be added to the merging iterator primed for + // next() to return the first real ROD in this shard + LocationAwareSeekableRODIterator it = dataSource.seek(provider.getLocus()); + it.seekForward(genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart()-1)); + + states.add(new ReferenceOrderedDataState(dataSource,it)); + + // we need to special case the interval so we don't always think there's a rod at the first location + if ( dataSource.getName().equals(INTERVAL_ROD_NAME) ) { + if ( interval != null ) + throw new RuntimeException("BUG: interval local variable already assigned " + interval); + interval = it.next(); + } else { + iterators.add( it ); + } + } + + rodQueue = new RODMergingIterator(iterators); + } + + @Override + public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) { + // special case the interval again -- add it into the ROD + if ( interval != null ) { allTracksHere.add(interval); } + return new RefMetaDataTracker(allTracksHere); + } + + public boolean hasNext() { + if ( ! rodQueue.hasNext() ) + return false; + else { + return ! rodQueue.peekLocation().isPast(locus); + } + } + + /** + * Returns the next covered locus context in the shard. + * @return Next covered locus context in the shard. + * @throw NoSuchElementException if no such element exists. + */ + public AlignmentContext next() { + if ( DEBUG ) System.out.printf("In RodLocusView.next()...%n"); + RODRecordList datum = rodQueue.next(); + if ( DEBUG ) System.out.printf("In RodLocusView.next(); datum = %s...%n", datum.getLocation()); + + if ( DEBUG ) System.out.printf("In RodLocusView.next(): creating tracker...%n"); + + allTracksHere = getSpanningTracks(datum); + GenomeLoc rodSite = datum.getLocation(); + GenomeLoc site = genomeLocParser.createGenomeLoc( rodSite.getContig(), rodSite.getStart(), rodSite.getStart()); + + if ( DEBUG ) System.out.printf("rodLocusView.next() is at %s%n", site); + + // calculate the number of skipped bases, and update lastLoc so we can do that again in the next() + long skippedBases = getSkippedBases( rodSite ); + lastLoc = site; + return new AlignmentContext(site, new ReadBackedPileupImpl(site), skippedBases); + } + + private Collection getSpanningTracks(RODRecordList marker) { + return rodQueue.allElementsLTE(marker); + } + + /** + * Returns the number of reference bases that have been skipped: + * + * 1 -- since the last processed location if we have one + * 2 -- from the beginning of the shard if this is the first loc + * 3 -- from the last location to the current position + * + * @param currentPos + * @return + */ + private long getSkippedBases( GenomeLoc currentPos ) { + // the minus - is because if lastLoc == null, you haven't yet seen anything in this interval, so it should also be counted as skipped + Integer compStop = lastLoc == null ? locus.getStart() - 1 : lastLoc.getStop(); + long skippedBases = currentPos.getStart() - compStop - 1; + + if ( skippedBases < -1 ) { // minus 1 value is ok + throw new RuntimeException(String.format("BUG: skipped bases=%d is < 0: cur=%s vs. last=%s, shard=%s", + skippedBases, currentPos, lastLoc, locus)); + } + return Math.max(skippedBases, 0); + } + + /** + * Get the location one after the last position we will traverse through + * @return + */ + public GenomeLoc getLocOneBeyondShard() { + return genomeLocParser.createGenomeLoc(locus.getContig(),locus.getStop()+1); + } + + /** + * How many bases are we skipping from the current location to the end of the interval / shard + * if we have no more elements + * + * @return + */ + public long getLastSkippedBases() { + if ( hasNext() ) + throw new RuntimeException("BUG: getLastSkippedBases called when there are elements remaining."); + + return getSkippedBases(getLocOneBeyondShard()); + } + + /** + * Closes the current view. + */ + public void close() { + for( ReferenceOrderedDataState state: states ) + state.dataSource.close( state.iterator ); + + rodQueue = null; + allTracksHere = null; + } +} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProvider.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProvider.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProvider.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProvider.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/View.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/View.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/View.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/View.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/package-info.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ActiveRegionShardBalancer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ActiveRegionShardBalancer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ActiveRegionShardBalancer.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ActiveRegionShardBalancer.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMAccessPlan.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMAccessPlan.java new file mode 100644 index 000000000..178d440bf --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMAccessPlan.java @@ -0,0 +1,170 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.util.PeekableIterator; +import htsjdk.samtools.GATKBAMFileSpan; +import htsjdk.samtools.GATKChunk; +import htsjdk.samtools.util.BlockCompressedFilePointerUtil; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.util.LinkedList; +import java.util.List; + +/** +* Created by IntelliJ IDEA. +* User: mhanna +* Date: 10/14/11 +* Time: 10:47 PM +* To change this template use File | Settings | File Templates. +*/ +class BAMAccessPlan { + private final SAMReaderID reader; + private final BlockInputStream inputStream; + + private final List positions; + private PeekableIterator positionIterator; + + /** + * Stores the next block address to read, or -1 if no such block is available. + */ + private long nextBlockAddress; + + + BAMAccessPlan(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) { + this.reader = reader; + this.inputStream = inputStream; + + this.positions = fileSpan.getGATKChunks(); + initialize(); + } + + public SAMReaderID getReader() { + return reader; + } + + public BlockInputStream getInputStream() { + return inputStream; + } + + /** + * Retrieves the next block address to be read. + * @return Next block address to be read. + */ + public long getBlockAddress() { + return nextBlockAddress; + } + + /** + * Retrieves the first offset of interest in the block returned by getBlockAddress(). + * @return First block of interest in this segment. + */ + public int getFirstOffsetInBlock() { + return (nextBlockAddress == positionIterator.peek().getBlockStart()) ? positionIterator.peek().getBlockOffsetStart() : 0; + } + + /** + * Gets the spans overlapping the given block; used to copy the contents of the block into the circular buffer. + * @param blockAddress Block address for which to search. + * @param filePosition Block address at which to terminate the last chunk if the last chunk goes beyond this span. + * @return list of chunks containing that block. + */ + public List getSpansOverlappingBlock(long blockAddress, long filePosition) { + List spansOverlapping = new LinkedList(); + // While the position iterator overlaps the given block, pull out spans to report. + while(positionIterator.hasNext() && positionIterator.peek().getBlockStart() <= blockAddress) { + // Create a span over as much of the block as is covered by this chunk. + int blockOffsetStart = (blockAddress == positionIterator.peek().getBlockStart()) ? positionIterator.peek().getBlockOffsetStart() : 0; + + // Calculate the end of this span. If the span extends past this block, cap it using the current file position. + long blockEnd; + int blockOffsetEnd; + if(blockAddress < positionIterator.peek().getBlockEnd()) { + blockEnd = filePosition; + blockOffsetEnd = 0; + } + else { + blockEnd = positionIterator.peek().getBlockEnd(); + blockOffsetEnd = positionIterator.peek().getBlockOffsetEnd(); + } + + GATKChunk newChunk = new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd); + + if(newChunk.getChunkStart() <= newChunk.getChunkEnd()) + spansOverlapping.add(new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd)); + + // If the value currently stored in the position iterator ends past the current block, we must be done. Abort. + if(!positionIterator.hasNext() || positionIterator.peek().getBlockEnd() > blockAddress) + break; + + // If the position iterator ends before the block ends, pull the position iterator forward. + if(positionIterator.peek().getBlockEnd() <= blockAddress) + positionIterator.next(); + } + + return spansOverlapping; + } + + public void reset() { + initialize(); + } + + /** + * Resets the SAM reader position to its original state. + */ + private void initialize() { + this.positionIterator = new PeekableIterator(positions.iterator()); + if(positionIterator.hasNext()) + nextBlockAddress = positionIterator.peek().getBlockStart(); + else + nextBlockAddress = -1; + } + + /** + * Advances the current position to the next block to read, given the current position in the file. + * @param filePosition The current position within the file. + */ + void advancePosition(final long filePosition) { + nextBlockAddress = BlockCompressedFilePointerUtil.getBlockAddress(filePosition); + + // Check the current file position against the iterator; if the iterator is before the current file position, + // draw the iterator forward. Remember when performing the check that coordinates are half-open! + while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) + positionIterator.next(); + + // If the block iterator has shot past the file pointer, bring the file pointer flush with the start of the current block. + if(positionIterator.hasNext() && filePosition < positionIterator.peek().getChunkStart()) + nextBlockAddress = positionIterator.peek().getBlockStart(); + + // If we've shot off the end of the block pointer, notify consumers that iteration is complete. + if(!positionIterator.hasNext()) + nextBlockAddress = -1; + } + + private boolean isFilePositionPastEndOfChunk(final long filePosition, final GATKChunk chunk) { + return filePosition >= chunk.getChunkEnd(); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMSchedule.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMSchedule.java new file mode 100644 index 000000000..aca33e411 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMSchedule.java @@ -0,0 +1,531 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.util.PeekableIterator; +import htsjdk.samtools.Bin; +import htsjdk.samtools.GATKBAMFileSpan; +import htsjdk.samtools.GATKChunk; +import htsjdk.samtools.util.CloseableIterator; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.GATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; +import java.util.*; + +/** + * Writes schedules for a single BAM file to a target output file. + */ +public class BAMSchedule implements CloseableIterator { + /** + * File in which to store schedule data. + */ + private File scheduleFile; + + /** + * File channel for the schedule file. + */ + private FileChannel scheduleFileChannel; + + /** + * The definitive, sorted list of reader IDs. Order is important here: the order + * in which the reader IDs are presented here maps to the order in which they appear in the file. + */ + private final List readerIDs = new ArrayList(); + + /** + * Iterators over the schedule. Stored in the same order as readerIDs, above. + */ + private final List> scheduleIterators = new ArrayList>(); + + /** + * Next schedule entry to be returned. Null if no additional entries are present. + */ + private BAMScheduleEntry nextScheduleEntry; + + /** + * Reference sequence for which to write the schedule. + */ + private final int referenceSequence; + + /** + * Sizes of ints and longs in bytes. + */ + private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8; + private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8; + + /** + * Create a new BAM schedule based on the given index. + * @param dataSource The SAM data source to use. + * @param intervals List of + */ + public BAMSchedule(final SAMDataSource dataSource, final List intervals) { + if(intervals.isEmpty()) + throw new ReviewedGATKException("Tried to write schedule for empty interval list."); + + referenceSequence = dataSource.getHeader().getSequence(intervals.get(0).getContig()).getSequenceIndex(); + + createScheduleFile(); + + readerIDs.addAll(dataSource.getReaderIDs()); + + for(final SAMReaderID reader: readerIDs) { + final GATKBAMIndex index = dataSource.getIndex(reader); + final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence); + + int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1); + Iterator locusIterator = intervals.iterator(); + GenomeLoc currentLocus = locusIterator.next(); + + final long readerStartOffset = position(); + + int maxChunkCount = 0; + + while(currentBinInLowestLevel < GATKBAMIndex.MAX_BINS && currentLocus != null) { + final Bin bin = new Bin(referenceSequence,currentBinInLowestLevel); + final int binStart = index.getFirstLocusInBin(bin); + final int binStop = index.getLastLocusInBin(bin); + + // In required, pull bin iterator ahead to the point of the next GenomeLoc. + if(binStop < currentLocus.getStart()) { + currentBinInLowestLevel++; + continue; + } + + // At this point, the bin stop is guaranteed to be >= the start of the locus. + // If the bins have gone past the current locus, update the current locus if at all possible. + if(binStart > currentLocus.getStop()) { + currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; + continue; + } + + // Code at this point knows that the current bin is neither before nor after the current locus, + // so it must overlap. Add this region to the filesystem. + final GATKBAMFileSpan fileSpan = indexData.getSpanOverlapping(bin); + + if(!fileSpan.isEmpty()) { + // File format is binary in little endian; start of region, end of region, num chunks, then the chunks themselves. + ByteBuffer buffer = allocateByteBuffer(2*INT_SIZE_IN_BYTES + INT_SIZE_IN_BYTES + fileSpan.getGATKChunks().size()*LONG_SIZE_IN_BYTES*2); + buffer.putInt(binStart); + buffer.putInt(binStop); + buffer.putInt(fileSpan.getGATKChunks().size()); + for(GATKChunk chunk: fileSpan.getGATKChunks()) { + buffer.putLong(chunk.getChunkStart()); + buffer.putLong(chunk.getChunkEnd()); + } + maxChunkCount = Math.max(maxChunkCount,fileSpan.getGATKChunks().size()); + + // Prepare buffer for writing + buffer.flip(); + + // And write. + write(buffer); + } + + currentBinInLowestLevel++; + } + + final long readerStopOffset = position(); + + scheduleIterators.add(new PeekableIterator(new BAMScheduleIterator(reader,readerStartOffset,readerStopOffset,maxChunkCount))); + + // Iterator initialization might move the file pointer. Make sure it gets reset back to where it was before iterator initialization. + position(readerStopOffset); + } + + advance(); + } + + /** + * Determine whether more ScheduleEntries are present in the iterator. + * @return Next schedule entry to parse. + */ + @Override + public boolean hasNext() { + return nextScheduleEntry != null; + } + + /** + * Retrieve the next schedule entry in the list. + * @return next schedule entry in the queue. + */ + @Override + public BAMScheduleEntry next() { + BAMScheduleEntry currentScheduleEntry = nextScheduleEntry; + advance(); + return currentScheduleEntry; + } + + /** + * Close down and delete the file. + */ + @Override + public void close() { + try { + scheduleFileChannel.close(); + } + catch(IOException ex) { + throw makeIOFailureException(true, "Unable to close schedule file.", ex); + } + } + + /** + * Convenience routine for creating UserExceptions + * @param wasWriting + * @param message + * @param e + * @return + */ + private final GATKException makeIOFailureException(final boolean wasWriting, final String message, final Exception e) { + if ( wasWriting ) { + if ( e == null ) + return new UserException.CouldNotCreateOutputFile(scheduleFile, message); + else + return new UserException.CouldNotCreateOutputFile(scheduleFile, message, e); + } else { + if ( e == null ) + return new UserException.CouldNotReadInputFile(scheduleFile, message); + else + return new UserException.CouldNotReadInputFile(scheduleFile, message, e); + } + } + + /** + * Advance to the next schedule entry. + */ + private void advance() { + nextScheduleEntry = null; + + BitSet selectedIterators = new BitSet(readerIDs.size()); + int currentStart = Integer.MAX_VALUE; + int currentStop = Integer.MAX_VALUE; + + // Select every iterator whose next element is the lowest element in the list. + for(int reader = 0; reader < scheduleIterators.size(); reader++) { + PeekableIterator scheduleIterator = scheduleIterators.get(reader); + if(!scheduleIterator.hasNext()) + continue; + + // If the iterator starts after this one, skip over it. + if(scheduleIterator.peek().start > currentStart) + continue; + + // If the iterator starts at the same point as this one, add it to the list. + if(scheduleIterator.peek().start == currentStart) { + selectedIterators.set(reader); + currentStop = Math.min(scheduleIterator.peek().stop,currentStop); + continue; + } + + // If the iterator is less than anything seen before it, purge the selections and make this one current. + if(scheduleIterator.peek().start < currentStart) { + selectedIterators.clear(); + selectedIterators.set(reader); + currentStart = scheduleIterator.peek().start; + currentStop = scheduleIterator.peek().stop; + } + } + + // Out of iterators? Abort early. + if(selectedIterators.isEmpty()) + return; + + // Create the target schedule entry + BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop); + + // For each schedule entry with data, load the data into the merged schedule. + for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) { + PeekableIterator scheduleIterator = scheduleIterators.get(reader); + BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek(); + mergedScheduleEntry.mergeInto(individualScheduleEntry); + + // If the schedule iterator ends after this entry, consume it. + if(individualScheduleEntry.stop <= currentStop) + scheduleIterator.next(); + } + + // For each schedule entry without data, add a blank entry. + for (int reader = selectedIterators.nextClearBit(0); reader < readerIDs.size(); reader = selectedIterators.nextClearBit(reader+1)) { + mergedScheduleEntry.addFileSpan(readerIDs.get(reader),new GATKBAMFileSpan()); + } + + nextScheduleEntry = mergedScheduleEntry; + } + + @Override + public void remove() { throw new UnsupportedOperationException("Unable to remove from a schedule iterator."); } + + /** + * Create a new schedule file, containing schedule information for all BAM files being dynamically merged. + */ + private void createScheduleFile() { + try { + scheduleFile = File.createTempFile("bamschedule."+referenceSequence,null); + scheduleFileChannel = new RandomAccessFile(scheduleFile,"rw").getChannel(); + } + catch(IOException ex) { + throw new UserException("Unable to create a temporary BAM schedule file. Please make sure Java can write to the default temp directory or use -Djava.io.tmpdir= to instruct it to use a different temp directory instead.",ex); + } + scheduleFile.deleteOnExit(); + + } + + /** + * Creates a new byte buffer of the given size. + * @param size the size of buffer to allocate. + * @return Newly allocated byte buffer. + */ + private ByteBuffer allocateByteBuffer(final int size) { + ByteBuffer buffer = ByteBuffer.allocate(size); + buffer.order(ByteOrder.LITTLE_ENDIAN); + return buffer; + } + + /** + * Reads the contents at the current position on disk into the given buffer. + * @param buffer buffer to fill. + */ + private int read(final ByteBuffer buffer) { + try { + return scheduleFileChannel.read(buffer); + } + catch(IOException ex) { + throw makeIOFailureException(false, "Unable to read data from BAM schedule file.", ex); + } + } + + private void write(final ByteBuffer buffer) { + try { + scheduleFileChannel.write(buffer); + if(buffer.remaining() > 0) + throw makeIOFailureException(true, "Unable to write entire buffer to file.", null); + } + catch(IOException ex) { + throw makeIOFailureException(true, "Unable to write data to BAM schedule file.", ex); + } + } + + /** + * Reads the current position from the file channel. + * @return Current position within file channel. + */ + private long position() { + try { + return scheduleFileChannel.position(); + } + catch(IOException ex) { + throw makeIOFailureException(false, "Unable to retrieve position of BAM schedule file.", ex); + } + } + + /** + * Reposition the file channel to the specified offset wrt the start of the file. + * @param position The position. + */ + private void position(final long position) { + try { + scheduleFileChannel.position(position); + } + catch(IOException ex) { + throw makeIOFailureException(false, "Unable to position BAM schedule file.",ex); + } + } + + /** + * An iterator over the schedule for a single BAM file. + */ + private class BAMScheduleIterator implements Iterator { + /** + * ID of the reader associated with the given schedule. + */ + private final SAMReaderID reader; + + /** + * Current position in the file. + */ + private long currentPosition; + + /** + * Stopping file position of last bin in file for this reader, exclusive. + */ + private final long stopPosition; + + /** + * Byte buffer used to store BAM header info. + */ + private final ByteBuffer binHeader; + + /** + * Byte buffer used to store chunk data. + */ + private final ByteBuffer chunkData; + + public BAMScheduleIterator(final SAMReaderID reader, final long startPosition, final long stopPosition, final int maxChunkCount) { + this.reader = reader; + this.currentPosition = startPosition; + this.stopPosition = stopPosition; + binHeader = allocateByteBuffer(INT_SIZE_IN_BYTES*3); + chunkData = allocateByteBuffer(maxChunkCount*LONG_SIZE_IN_BYTES*2); + } + + @Override + public boolean hasNext() { + return currentPosition < stopPosition; + } + + @Override + public BAMScheduleEntry next() { + position(currentPosition); + + // Read data. + int binHeaderBytesRead = read(binHeader); + + // Make sure we read in a complete bin header: + if ( binHeaderBytesRead < INT_SIZE_IN_BYTES * 3 ) { + throw new ReviewedGATKException(String.format("Unable to read a complete bin header from BAM schedule file %s for BAM file %s. " + + "The BAM schedule file is likely incomplete/corrupt.", + scheduleFile.getAbsolutePath(), reader.getSamFilePath())); + } + + // Decode contents. + binHeader.flip(); + final int start = binHeader.getInt(); + final int stop = binHeader.getInt(); + final int numChunks = binHeader.getInt(); + + // Prepare bin buffer for next read. + binHeader.flip(); + + // Prepare a target buffer for chunks. + GATKChunk[] chunks = new GATKChunk[numChunks]; + + // Read all chunk data. + chunkData.limit(numChunks*LONG_SIZE_IN_BYTES*2); + long bytesRead = read(chunkData); + if(bytesRead != numChunks*LONG_SIZE_IN_BYTES*2) + throw new ReviewedGATKException("Unable to read all chunks from file"); + + // Prepare for reading. + chunkData.flip(); + + for(int i = 0; i < numChunks; i++) + chunks[i] = new GATKChunk(chunkData.getLong(),chunkData.getLong()); + + // Prepare chunk buffer for next read. + chunkData.flip(); + + BAMScheduleEntry nextScheduleEntry = new BAMScheduleEntry(start,stop); + nextScheduleEntry.addFileSpan(reader,new GATKBAMFileSpan(chunks)); + + // Reset the position of the iterator at the next contig. + currentPosition = position(); + + return nextScheduleEntry; + } + + /** + * Not supported. + */ + @Override + public void remove() { + throw new UnsupportedOperationException("Unable to remove from a BAMScheduleIterator"); + } + + } +} + +/** + * A single proto-shard to be processed. + */ +class BAMScheduleEntry { + /** + * Starting position for the genomic entry. + */ + public final int start; + + /** + * Ending position for the genomic entry. + */ + public final int stop; + + /** + * The spans representing the given region. + */ + public final Map fileSpans = new HashMap(); + + BAMScheduleEntry(final int start, final int stop) { + this.start = start; + this.stop = stop; + } + + /** + * Add a new file span to this schedule. + * @param reader Reader associated with the span. + * @param fileSpan Blocks to read in the given reader. + */ + public void addFileSpan(final SAMReaderID reader, final GATKBAMFileSpan fileSpan) { + fileSpans.put(reader,fileSpan); + } + + /** + * A naive merge operation. Merge the fileSpans in other into this, blowing up if conflicts are + * detected. Completely ignores merging start and stop. + * @param other Other schedule entry to merging into this one. + */ + public void mergeInto(final BAMScheduleEntry other) { + final int thisSize = fileSpans.size(); + final int otherSize = other.fileSpans.size(); + fileSpans.putAll(other.fileSpans); + if(fileSpans.size() != thisSize+otherSize) + throw new ReviewedGATKException("Unable to handle overlaps when merging BAM schedule entries."); + } + + /** + * Returns true if the location of this bin tree is before the given position. + * @param locus Locus to test. + * @return True if this bin sits completely before the given locus; false otherwise. + */ + public boolean isBefore(final GenomeLoc locus) { + return stop < locus.getStart(); + } + + /** + * Checks overlap between this bin tree and other bin trees. + * @param position the position over which to detect overlap. + * @return True if the segment overlaps. False otherwise. + */ + public boolean overlaps(final GenomeLoc position) { + return !(position.getStop() < start || position.getStart() > stop); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMScheduler.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMScheduler.java new file mode 100644 index 000000000..f916bc185 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMScheduler.java @@ -0,0 +1,321 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.util.PeekableIterator; +import htsjdk.samtools.GATKBAMFileSpan; +import htsjdk.samtools.GATKChunk; +import htsjdk.samtools.SAMSequenceRecord; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.util.*; + +/** + * Assign intervals to the most appropriate blocks, keeping as little as possible in memory at once. + */ +public class BAMScheduler implements Iterator { + private final SAMDataSource dataSource; + + private final Map indexFiles = new HashMap(); + + private FilePointer nextFilePointer = null; + + private GenomeLocSortedSet loci; + private PeekableIterator locusIterator; + private GenomeLoc currentLocus; + private IntervalMergingRule intervalMergingRule; + + /* + * Creates BAMScheduler using contigs from the given BAM data source. + * + * @param dataSource BAM source + * @return non-null BAM scheduler + */ + public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource) { + final BAMScheduler scheduler = new BAMScheduler(dataSource, IntervalMergingRule.ALL); + final GenomeLocSortedSet intervals = GenomeLocSortedSet.createSetFromSequenceDictionary(dataSource.getHeader().getSequenceDictionary()); + scheduler.populateFilteredIntervalList(intervals); + return scheduler; + } + + public static BAMScheduler createOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) { + BAMScheduler scheduler = new BAMScheduler(dataSource, IntervalMergingRule.ALL); + scheduler.populateUnfilteredIntervalList(parser); + return scheduler; + } + + public static BAMScheduler createOverIntervals(final SAMDataSource dataSource, final IntervalMergingRule mergeRule, final GenomeLocSortedSet loci) { + BAMScheduler scheduler = new BAMScheduler(dataSource, mergeRule); + scheduler.populateFilteredIntervalList(loci); + return scheduler; + } + + + private BAMScheduler(final SAMDataSource dataSource, final IntervalMergingRule mergeRule) { + this.dataSource = dataSource; + this.intervalMergingRule = mergeRule; + for(SAMReaderID reader: dataSource.getReaderIDs()) { + GATKBAMIndex index = dataSource.getIndex(reader); + if(index != null) + indexFiles.put(reader,dataSource.getIndex(reader)); + } + } + + /** + * The consumer has asked for a bounded set of locations. Prepare an iterator over those locations. + * @param loci The list of locations to search and iterate over. + */ + private void populateFilteredIntervalList(final GenomeLocSortedSet loci) { + this.loci = loci; + if(!indexFiles.isEmpty()) { + // If index data is available, start up the iterator. + locusIterator = new PeekableIterator(loci.iterator()); + if(locusIterator.hasNext()) + currentLocus = locusIterator.next(); + advance(); + } + else { + // Otherwise, seed the iterator with a single file pointer over the entire region. + nextFilePointer = generatePointerOverEntireFileset(); + for(GenomeLoc locus: loci) + nextFilePointer.addLocation(locus); + locusIterator = new PeekableIterator(Collections.emptyList().iterator()); + } + } + + /** + * The consumer has provided null, meaning to iterate over all available data. Create a file pointer stretching + * from just before the start of the region to the end of the region. + */ + private void populateUnfilteredIntervalList(final GenomeLocParser parser) { + this.loci = new GenomeLocSortedSet(parser); + locusIterator = new PeekableIterator(Collections.emptyList().iterator()); + nextFilePointer = generatePointerOverEntireFileset(); + } + + /** + * Generate a span that runs from the end of the BAM header to the end of the fle. + * @return A file pointer over the specified region. + */ + private FilePointer generatePointerOverEntireFileset() { + FilePointer filePointer = new FilePointer(intervalMergingRule); + + // This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is + // the only FilePointer we will create. This allows us to have this FilePointer represent regions from + // multiple contigs + filePointer.setIsMonolithic(true); + + Map currentPosition; + + currentPosition = dataSource.getInitialReaderPositions(); + + for(SAMReaderID reader: dataSource.getReaderIDs()) + filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart())); + return filePointer; + } + + public boolean hasNext() { + return nextFilePointer != null; + } + + public FilePointer next() { + if(!hasNext()) + throw new NoSuchElementException("No next element available in interval sharder"); + FilePointer currentFilePointer = nextFilePointer; + nextFilePointer = null; + advance(); + + return currentFilePointer; + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove FilePointers from an IntervalSharder"); + } + + private void advance() { + if(loci.isEmpty()) + return; + + while(nextFilePointer == null && currentLocus != null) { + // special case handling of the unmapped shard. + if(currentLocus == GenomeLoc.UNMAPPED) { + nextFilePointer = new FilePointer(intervalMergingRule, GenomeLoc.UNMAPPED); + for(SAMReaderID id: dataSource.getReaderIDs()) + nextFilePointer.addFileSpans(id,createSpanToEndOfFile(indexFiles.get(id).getStartOfLastLinearBin())); + currentLocus = null; + continue; + } + + nextFilePointer = new FilePointer(intervalMergingRule); + + int coveredRegionStart = 1; + int coveredRegionStop = Integer.MAX_VALUE; + GenomeLoc coveredRegion = null; + + BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(currentLocus); + + // No overlapping data at all. + if(scheduleEntry != null) { + coveredRegionStart = Math.max(coveredRegionStart,scheduleEntry.start); + coveredRegionStop = Math.min(coveredRegionStop,scheduleEntry.stop); + coveredRegion = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStart,coveredRegionStop); + + nextFilePointer.addFileSpans(scheduleEntry.fileSpans); + } + else { + // Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty. + for(SAMReaderID reader: indexFiles.keySet()) + nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan()); + } + + // Early exit if no bins were found. + if(coveredRegion == null) { + // for debugging only: maximum split is 16384. + nextFilePointer.addLocation(currentLocus); + currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; + continue; + } + + // Early exit if only part of the first interval was found. + if(currentLocus.startsBefore(coveredRegion)) { + int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart(); + GenomeLoc[] splitContigs = currentLocus.split(splitPoint); + nextFilePointer.addLocation(splitContigs[0]); + currentLocus = splitContigs[1]; + continue; + } + + // Define the initial range of the file pointer, aka the region where the locus currently being processed intersects the BAM list. + GenomeLoc initialLocation = currentLocus.intersect(coveredRegion); + nextFilePointer.addLocation(initialLocation); + + // See whether the BAM regions discovered overlap the next set of intervals in the interval list. If so, include every overlapping interval. + if(!nextFilePointer.locations.isEmpty()) { + while(locusIterator.hasNext() && locusIterator.peek().overlapsP(coveredRegion)) { + currentLocus = locusIterator.next(); + nextFilePointer.addLocation(currentLocus.intersect(coveredRegion)); + } + + // Chop off the uncovered portion of the locus. Since we know that the covered region overlaps the current locus, + // we can simplify the interval creation process to the end of the covered region to the stop of the given interval. + if(coveredRegionStop < currentLocus.getStop()) + currentLocus = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStop+1,currentLocus.getStop()); + else if(locusIterator.hasNext()) + currentLocus = locusIterator.next(); + else + currentLocus = null; + } + + } + } + + + /** + * The last reference sequence processed by this iterator. + */ + private Integer lastReferenceSequenceLoaded = null; + + /** + * The stateful iterator used to progress through the genoem. + */ + private PeekableIterator bamScheduleIterator = null; + + /** + * Clean up underlying BAMSchedule file handles. + */ + public void close() { + if(bamScheduleIterator != null) + bamScheduleIterator.close(); + } + + /** + * Get the next overlapping tree of bins associated with the given BAM file. + * @param currentLocus The actual locus for which to check overlap. + * @return The next schedule entry overlapping with the given list of loci. + */ + private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final GenomeLoc currentLocus) { + // Make sure that we consult the BAM header to ensure that we're using the correct contig index for this contig name. + // This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then + // we'll be using the correct contig index for the BAMs. + // TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing. + SAMSequenceRecord currentContigSequenceRecord = dataSource.getHeader().getSequence(currentLocus.getContig()); + if ( currentContigSequenceRecord == null ) { + throw new UserException(String.format("Contig %s not present in sequence dictionary for merged BAM header: %s", + currentLocus.getContig(), + ReadUtils.prettyPrintSequenceRecords(dataSource.getHeader().getSequenceDictionary()))); + } + + final int currentContigIndex = currentContigSequenceRecord.getSequenceIndex(); + + // Stale reference sequence or first invocation. (Re)create the binTreeIterator. + if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) { + if(bamScheduleIterator != null) + bamScheduleIterator.close(); + lastReferenceSequenceLoaded = currentContigIndex; + + // Naive algorithm: find all elements in current contig for proper schedule creation. + List lociInContig = new LinkedList(); + for(GenomeLoc locus: loci) { + if (!GenomeLoc.isUnmapped(locus) && dataSource.getHeader().getSequence(locus.getContig()) == null) + throw new ReviewedGATKException("BAM file(s) do not have the contig: " + locus.getContig() + ". You are probably using a different reference than the one this file was aligned with"); + + if (!GenomeLoc.isUnmapped(locus) && dataSource.getHeader().getSequence(locus.getContig()).getSequenceIndex() == lastReferenceSequenceLoaded) + lociInContig.add(locus); + } + + bamScheduleIterator = new PeekableIterator(new BAMSchedule(dataSource,lociInContig)); + } + + if(!bamScheduleIterator.hasNext()) + return null; + + // Peek the iterator along until finding the first binTree at or following the current locus. + BAMScheduleEntry bamScheduleEntry = bamScheduleIterator.peek(); + while(bamScheduleEntry != null && bamScheduleEntry.isBefore(currentLocus)) { + bamScheduleIterator.next(); + bamScheduleEntry = bamScheduleIterator.hasNext() ? bamScheduleIterator.peek() : null; + } + + return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? bamScheduleEntry : null; + } + + /** + * Create a span from the given start point to the end of the file. + * @param startOfRegion Start of the region, in encoded coordinates (block start << 16 & block offset). + * @return A file span from the given point to the end of the file. + */ + private GATKBAMFileSpan createSpanToEndOfFile(final long startOfRegion) { + return new GATKBAMFileSpan(new GATKChunk(startOfRegion,Long.MAX_VALUE)); + } + +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BGZFBlockLoadingDispatcher.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BGZFBlockLoadingDispatcher.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BGZFBlockLoadingDispatcher.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BGZFBlockLoadingDispatcher.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockInputStream.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockInputStream.java new file mode 100644 index 000000000..125d4f731 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockInputStream.java @@ -0,0 +1,451 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.GATKBAMFileSpan; +import htsjdk.samtools.GATKChunk; +import htsjdk.samtools.util.BlockCompressedInputStream; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +/** + * Presents decompressed blocks to the SAMFileReader. + */ +public class BlockInputStream extends InputStream { + /** + * Mechanism for triggering block loads. + */ + private final BGZFBlockLoadingDispatcher dispatcher; + + /** + * The reader whose data is supplied by this input stream. + */ + private final SAMReaderID reader; + + /** + * Length of the input stream. + */ + private final long length; + + /** + * The latest error reported by an asynchronous block load. + */ + private Throwable error; + + /** + * Current accessPlan. + */ + private BAMAccessPlan accessPlan; + + /** + * A stream of compressed data blocks. + */ + private final ByteBuffer buffer; + + /** + * Offsets of the given blocks in the buffer. + */ + private LinkedList blockOffsets = new LinkedList(); + + /** + * Source positions of the given blocks in the buffer. + */ + private LinkedList blockPositions = new LinkedList(); + + /** + * Provides a lock to wait for more data to arrive. + */ + private final Object lock = new Object(); + + /** + * An input stream to use when comparing data back to what it should look like. + */ + private final BlockCompressedInputStream validatingInputStream; + + /** + * Create a new block presenting input stream with a dedicated buffer. + * @param dispatcher the block loading messenger. + * @param reader the reader for which to load data. + * @param validate validates the contents read into the buffer against the contents of a Picard BlockCompressedInputStream. + */ + BlockInputStream(final BGZFBlockLoadingDispatcher dispatcher, final SAMReaderID reader, final boolean validate) { + this.reader = reader; + this.length = reader.getSamFile().length(); + + buffer = ByteBuffer.wrap(new byte[64*1024]); + buffer.order(ByteOrder.LITTLE_ENDIAN); + + // The state of the buffer assumes that the range of data written into the buffer appears in the range + // [position,limit), while extra capacity exists in the range [limit,capacity) + buffer.limit(0); + + this.dispatcher = dispatcher; + // TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream. + this.accessPlan = new BAMAccessPlan(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE))); + + // The block offsets / block positions guarantee that the ending offset/position in the data structure maps to + // the point in the file just following the last read. These two arrays should never be empty; initializing + // to 0 to match the position above. + this.blockOffsets.add(0); + this.blockPositions.add(0L); + + try { + if(validate) { + System.out.printf("BlockInputStream %s: BGZF block validation mode activated%n",this); + validatingInputStream = new BlockCompressedInputStream(reader.getSamFile()); + // A bug in ValidatingInputStream means that calling getFilePointer() immediately after initialization will result in an NPE. + // Poke the stream to start reading data. + validatingInputStream.available(); + } + else + validatingInputStream = null; + } + catch(IOException ex) { + throw new ReviewedGATKException("Unable to validate against Picard input stream",ex); + } + } + + public long length() { + return length; + } + + public long getFilePointer() { + long filePointer; + synchronized(lock) { + // Find the current block within the input stream. + int blockIndex; + for(blockIndex = 0; blockIndex+1 < blockOffsets.size() && buffer.position() > blockOffsets.get(blockIndex+1); blockIndex++) + ; + filePointer = blockPositions.get(blockIndex) + (buffer.position()-blockOffsets.get(blockIndex)); + } + +// if(validatingInputStream != null && filePointer != validatingInputStream.getFilePointer()) +// throw new ReviewedGATKException(String.format("Position of input stream is invalid; expected (block address, block offset) = (%d,%d), got (%d,%d)", +// BlockCompressedFilePointerUtil.getBlockAddress(validatingInputStream.getFilePointer()),BlockCompressedFilePointerUtil.getBlockOffset(validatingInputStream.getFilePointer()), +// BlockCompressedFilePointerUtil.getBlockAddress(filePointer),BlockCompressedFilePointerUtil.getBlockOffset(filePointer))); + + return filePointer; + } + + private void clearBuffers() { + this.accessPlan.reset(); + + // Buffer semantics say that outside of a lock, buffer should always be prepared for reading. + // Indicate no data to be read. + buffer.clear(); + buffer.limit(0); + + // Clear everything except the last block offset / position + blockOffsets.clear(); + blockOffsets.add(0); + while(blockPositions.size() > 1) + blockPositions.removeFirst(); + } + + public boolean eof() { + synchronized(lock) { + // TODO: Handle multiple empty BGZF blocks at end of the file. + return accessPlan != null && (accessPlan.getBlockAddress() < 0 || accessPlan.getBlockAddress() >= length); + } + } + + /** + * Submits a new access plan for the given dataset and seeks to the given point. + * @param accessPlan The next seek point for BAM data in this reader. + */ + public void submitAccessPlan(final BAMAccessPlan accessPlan) { + //System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress()); + this.accessPlan = accessPlan; + accessPlan.reset(); + + clearBuffers(); + + // Pull the iterator past any oddball chunks at the beginning of the shard (chunkEnd < chunkStart, empty chunks, etc). + // TODO: Don't pass these empty chunks in. + accessPlan.advancePosition(makeFilePointer(accessPlan.getBlockAddress(),0)); + + if(accessPlan.getBlockAddress() >= 0) { + waitForBufferFill(); + } + + if(validatingInputStream != null) { + try { + validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(),0)); + } + catch(IOException ex) { + throw new ReviewedGATKException("Unable to validate against Picard input stream",ex); + } + } + + } + + + private void compactBuffer() { + // Compact buffer to maximize storage space. + int bytesToRemove = 0; + + // Look ahead to see if we can compact away the first blocks in the series. + while(blockOffsets.size() > 1 && buffer.position() >= blockOffsets.get(1)) { + blockOffsets.remove(); + blockPositions.remove(); + bytesToRemove = blockOffsets.peek(); + } + + // If we end up with an empty block at the end of the series, compact this as well. + if(buffer.remaining() == 0 && blockOffsets.size() > 1 && buffer.position() >= blockOffsets.peek()) { + bytesToRemove += buffer.position(); + blockOffsets.remove(); + blockPositions.remove(); + } + + int finalBufferStart = buffer.position() - bytesToRemove; + int finalBufferSize = buffer.remaining(); + + // Position the buffer to remove the unneeded data, and compact it away. + buffer.position(bytesToRemove); + buffer.compact(); + + // Reset the limits for reading. + buffer.position(finalBufferStart); + buffer.limit(finalBufferStart+finalBufferSize); + + // Shift everything in the offset buffer down to accommodate the bytes removed from the buffer. + for(int i = 0; i < blockOffsets.size(); i++) + blockOffsets.set(i,blockOffsets.get(i)-bytesToRemove); + } + + /** + * Push contents of incomingBuffer into the end of this buffer. + * MUST be called from a thread that is NOT the reader thread. + * @param incomingBuffer The data being pushed into this input stream. + * @param accessPlan target access plan for the data. + * @param filePosition the current position of the file pointer + */ + public void copyIntoBuffer(final ByteBuffer incomingBuffer, final BAMAccessPlan accessPlan, final long filePosition) { + synchronized(lock) { + try { + if(validatingInputStream != null) { + byte[] validBytes = new byte[incomingBuffer.remaining()]; + + byte[] currentBytes = new byte[incomingBuffer.remaining()]; + int pos = incomingBuffer.position(); + int lim = incomingBuffer.limit(); + incomingBuffer.get(currentBytes); + + incomingBuffer.limit(lim); + incomingBuffer.position(pos); + + long currentFilePointer = validatingInputStream.getFilePointer(); + validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(), 0)); + validatingInputStream.read(validBytes); + validatingInputStream.seek(currentFilePointer); + + if(!Arrays.equals(validBytes,currentBytes)) + throw new ReviewedGATKException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this)); + } + + compactBuffer(); + // Open up the buffer for more reading. + buffer.limit(buffer.capacity()); + + // Get the spans overlapping this particular block... + List spansOverlapping = accessPlan.getSpansOverlappingBlock(accessPlan.getBlockAddress(),filePosition); + + // ...and advance the block + this.accessPlan = accessPlan; + accessPlan.advancePosition(makeFilePointer(filePosition, 0)); + + if(buffer.remaining() < incomingBuffer.remaining()) + lock.wait(); + + final int bytesInIncomingBuffer = incomingBuffer.limit(); + + for(GATKChunk spanOverlapping: spansOverlapping) { + // Clear out the endcap tracking state and add in the starting position for this transfer. + blockOffsets.removeLast(); + blockOffsets.add(buffer.position()); + blockPositions.removeLast(); + blockPositions.add(spanOverlapping.getChunkStart()); + + // Stream the buffer into the data stream. + incomingBuffer.limit((spanOverlapping.getBlockEnd() > spanOverlapping.getBlockStart()) ? bytesInIncomingBuffer : spanOverlapping.getBlockOffsetEnd()); + incomingBuffer.position(spanOverlapping.getBlockOffsetStart()); + buffer.put(incomingBuffer); + + // Add the endcap for this transfer. + blockOffsets.add(buffer.position()); + blockPositions.add(spanOverlapping.getChunkEnd()); + } + + // Set up the buffer for reading. + buffer.flip(); + + lock.notify(); + } + catch(Exception ex) { + reportException(ex); + lock.notify(); + } + } + } + + void reportException(Throwable t) { + synchronized(lock) { + this.error = t; + lock.notify(); + } + } + + private void checkForErrors() { + synchronized(lock) { + if(error != null) { + ReviewedGATKException toThrow = new ReviewedGATKException(String.format("Thread %s, BlockInputStream %s: Unable to retrieve BAM data from disk",Thread.currentThread().getId(),this),error); + toThrow.setStackTrace(error.getStackTrace()); + throw toThrow; + } + } + } + + /** + * Reads the next byte of data from the input stream. + * @return Next byte of data, from 0->255, as an int. + */ + @Override + public int read() { + byte[] singleByte = new byte[1]; + read(singleByte); + return singleByte[0]; + } + + /** + * Fills the given byte array to the extent possible. + * @param bytes byte array to be filled. + * @return The number of bytes actually read. + */ + @Override + public int read(byte[] bytes) { + return read(bytes,0,bytes.length); + } + + @Override + public int read(byte[] bytes, final int offset, final int length) { + int remaining = length; + synchronized(lock) { + while(remaining > 0) { + // Check for error conditions during last read. + checkForErrors(); + + // If completely out of space, queue up another buffer fill. + waitForBufferFill(); + + // Couldn't manage to load any data at all; abort and return what's available. + if(buffer.remaining() == 0) + break; + + int numBytesToCopy = Math.min(buffer.remaining(),remaining); + buffer.get(bytes,length-remaining+offset,numBytesToCopy); + remaining -= numBytesToCopy; + + //if(remaining > 0) + // System.out.printf("Thread %s: read the first %d bytes of a %d byte request%n",Thread.currentThread().getId(),length-remaining,length); + // TODO: Assert that we don't copy across a block boundary + } + + // Notify any waiting threads that some of the contents of the buffer were removed. + if(length-remaining > 0) + lock.notify(); + } + +// if(validatingInputStream != null) { +// byte[] validBytes = new byte[length]; +// try { +// validatingInputStream.read(validBytes,offset,length); +// for(int i = offset; i < offset+length; i++) { +// if(bytes[i] != validBytes[i]) +// throw new ReviewedGATKException(String.format("Thread %s: blockInputStream %s attempting to return wrong set of bytes; mismatch at offset %d",Thread.currentThread().getId(),this,i)); +// } +// } +// catch(IOException ex) { +// throw new ReviewedGATKException("Unable to validate against Picard input stream",ex); +// } +// } + + // If any data was copied into the buffer, return the amount of data copied. + if(remaining < length) + return length - remaining; + + // Otherwise, return -1. + return -1; + } + + public void close() { + if(validatingInputStream != null) { + try { + validatingInputStream.close(); + } + catch(IOException ex) { + throw new ReviewedGATKException("Unable to validate against Picard input stream",ex); + } + } + } + + public String getSource() { + return reader.getSamFilePath(); + } + + private void waitForBufferFill() { + synchronized(lock) { + if(buffer.remaining() == 0 && !eof()) { + //System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this); + dispatcher.queueBlockLoad(accessPlan); + try { + lock.wait(); + } + catch(InterruptedException ex) { + throw new ReviewedGATKException("Interrupt occurred waiting for buffer to fill",ex); + } + } + } + } + + /** + * Create an encoded BAM file pointer given the address of a BGZF block and an offset. + * @param blockAddress Physical address on disk of a BGZF block. + * @param blockOffset Offset into the uncompressed data stored in the BGZF block. + * @return 64-bit pointer encoded according to the BAM spec. + */ + public static long makeFilePointer(final long blockAddress, final int blockOffset) { + return blockAddress << 16 | blockOffset; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockLoader.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockLoader.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockLoader.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockLoader.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FileHandleCache.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FileHandleCache.java new file mode 100644 index 000000000..7f6653888 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FileHandleCache.java @@ -0,0 +1,229 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.GATKException; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.io.FileInputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; + +/** + * Caches frequently used file handles. Right now, caches only a single file handle. + * TODO: Generalize to support arbitrary file handle caches. + */ +public class FileHandleCache { + /** + * The underlying data structure storing file handles. + */ + private final FileHandleStorage fileHandleStorage; + + /** + * How many file handles should be kept open at once. + */ + private final int cacheSize; + + /** + * A uniquifier: assign a unique ID to every instance of a file handle. + */ + private final Map keyCounter = new HashMap(); + + /** + * A shared lock, private so that outside users cannot notify it. + */ + private final Object lock = new Object(); + + /** + * Indicates how many file handles are outstanding at this point. + */ + private int numOutstandingFileHandles = 0; + + /** + * Create a new file handle cache of the given cache size. + * @param cacheSize how many readers to hold open at once. + */ + public FileHandleCache(final int cacheSize) { + this.cacheSize = cacheSize; + fileHandleStorage = new FileHandleStorage(); + } + + /** + * Retrieves or opens a file handle for the given reader ID. + * @param key The ke + * @return A file input stream from the cache, if available, or otherwise newly opened. + */ + public FileInputStream claimFileInputStream(final SAMReaderID key) { + synchronized(lock) { + FileInputStream inputStream = findExistingEntry(key); + if(inputStream == null) { + try { + // If the cache is maxed out, wait for another file handle to emerge. + if(numOutstandingFileHandles >= cacheSize) + lock.wait(); + } + catch(InterruptedException ex) { + throw new ReviewedGATKException("Interrupted while waiting for a file handle"); + } + inputStream = openInputStream(key); + } + numOutstandingFileHandles++; + + //System.out.printf("Handing input stream %s to thread %s%n",inputStream,Thread.currentThread().getId()); + return inputStream; + } + } + + /** + * Releases the current reader and returns it to the cache. + * @param key The reader. + * @param inputStream The stream being used. + */ + public void releaseFileInputStream(final SAMReaderID key, final FileInputStream inputStream) { + synchronized(lock) { + numOutstandingFileHandles--; + UniqueKey newID = allocateKey(key); + fileHandleStorage.put(newID,inputStream); + // Let any listeners know that another file handle has become available. + lock.notify(); + } + } + + /** + * Finds an existing entry in the storage mechanism. + * @param key Reader. + * @return a cached stream, if available. Otherwise, + */ + private FileInputStream findExistingEntry(final SAMReaderID key) { + int existingHandles = getMostRecentUniquifier(key); + + // See if any of the keys currently exist in the repository. + for(int i = 0; i <= existingHandles; i++) { + UniqueKey uniqueKey = new UniqueKey(key,i); + if(fileHandleStorage.containsKey(uniqueKey)) + return fileHandleStorage.remove(uniqueKey); + } + + return null; + } + + /** + * Gets the most recent uniquifier used for the given reader. + * @param reader Reader for which to determine uniqueness. + * @return + */ + private int getMostRecentUniquifier(final SAMReaderID reader) { + if(keyCounter.containsKey(reader)) + return keyCounter.get(reader); + else return -1; + } + + private UniqueKey allocateKey(final SAMReaderID reader) { + int uniquifier = getMostRecentUniquifier(reader)+1; + keyCounter.put(reader,uniquifier); + return new UniqueKey(reader,uniquifier); + } + + private FileInputStream openInputStream(final SAMReaderID reader) { + try { + return new FileInputStream(reader.getSamFilePath()); + } + catch(IOException ex) { + throw new GATKException("Unable to open input file"); + } + } + + private void closeInputStream(final FileInputStream inputStream) { + try { + inputStream.close(); + } + catch(IOException ex) { + throw new GATKException("Unable to open input file"); + } + } + + /** + * Actually contains the file handles, purging them as they get too old. + */ + private class FileHandleStorage extends LinkedHashMap { + /** + * Remove the oldest entry + * @param entry Entry to consider removing. + * @return True if the cache size has been exceeded. False otherwise. + */ + @Override + protected boolean removeEldestEntry(Map.Entry entry) { + synchronized (lock) { + if(size() > cacheSize) { + keyCounter.put(entry.getKey().key,keyCounter.get(entry.getKey().key)-1); + closeInputStream(entry.getValue()); + + return true; + } + } + return false; + } + } + + /** + * Uniquifies a key by adding a numerical uniquifier. + */ + private class UniqueKey { + /** + * The file handle's key. + */ + private final SAMReaderID key; + + /** + * A uniquifier, so that multiple of the same reader can exist in the cache. + */ + private final int uniqueID; + + public UniqueKey(final SAMReaderID reader, final int uniqueID) { + this.key = reader; + this.uniqueID = uniqueID; + } + + @Override + public boolean equals(Object other) { + if(!(other instanceof UniqueKey)) + return false; + UniqueKey otherUniqueKey = (UniqueKey)other; + return key.equals(otherUniqueKey.key) && this.uniqueID == otherUniqueKey.uniqueID; + } + + @Override + public int hashCode() { + return key.hashCode(); + } + } + + + +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointer.java new file mode 100644 index 000000000..4ea4aabf9 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointer.java @@ -0,0 +1,437 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.util.PeekableIterator; +import htsjdk.samtools.GATKBAMFileSpan; +import htsjdk.samtools.GATKChunk; +import htsjdk.samtools.SAMFileSpan; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.interval.IntervalUtils; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.util.*; + +/** + * Represents a small section of a BAM file, and every associated interval. + */ +public class FilePointer { + protected final SortedMap fileSpans = new TreeMap(); + protected final List locations = new ArrayList(); + protected final IntervalMergingRule intervalMergingRule; + + /** + * Does this file pointer point into an unmapped region? + */ + protected final boolean isRegionUnmapped; + + /** + * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will + * ever visit during this GATK run? If this is set to true, the engine will expect to see only this + * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals + * from more than one contig. + */ + private boolean isMonolithic = false; + + /** + * Index of the contig covered by this FilePointer. Only meaningful for non-monolithic, mapped FilePointers + */ + private Integer contigIndex = null; + + + public FilePointer( final IntervalMergingRule mergeRule, final List locations ) { + this.intervalMergingRule = mergeRule; + this.locations.addAll(locations); + this.isRegionUnmapped = checkUnmappedStatus(); + + validateAllLocations(); + if ( locations.size() > 0 ) { + contigIndex = locations.get(0).getContigIndex(); + } + } + + public FilePointer( final IntervalMergingRule mergeRule, final GenomeLoc... locations ) { + this(mergeRule, Arrays.asList(locations)); + } + + public FilePointer( final Map fileSpans, final IntervalMergingRule mergeRule, final List locations ) { + this(mergeRule, locations); + this.fileSpans.putAll(fileSpans); + } + + private boolean checkUnmappedStatus() { + boolean foundMapped = false, foundUnmapped = false; + + for( GenomeLoc location: locations ) { + if ( GenomeLoc.isUnmapped(location) ) + foundUnmapped = true; + else + foundMapped = true; + } + if ( foundMapped && foundUnmapped ) + throw new ReviewedGATKException("BUG: File pointers cannot be mixed mapped/unmapped."); + + return foundUnmapped; + } + + private void validateAllLocations() { + // Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction + if ( isRegionUnmapped || isMonolithic ) { + return; + } + + Integer previousContigIndex = null; + + for ( GenomeLoc location : locations ) { + if ( previousContigIndex != null && previousContigIndex != location.getContigIndex() ) { + throw new ReviewedGATKException("Non-monolithic file pointers must contain intervals from at most one contig"); + } + + previousContigIndex = location.getContigIndex(); + } + } + + private void validateLocation( GenomeLoc location ) { + if ( isRegionUnmapped != GenomeLoc.isUnmapped(location) ) { + throw new ReviewedGATKException("BUG: File pointers cannot be mixed mapped/unmapped."); + } + if ( ! isRegionUnmapped && ! isMonolithic && contigIndex != null && contigIndex != location.getContigIndex() ) { + throw new ReviewedGATKException("Non-monolithic file pointers must contain intervals from at most one contig"); + } + } + + /** + * Returns an immutable view of this FilePointer's file spans + * + * @return an immutable view of this FilePointer's file spans + */ + public Map getFileSpans() { + return Collections.unmodifiableMap(fileSpans); + } + + /** + * Returns an immutable variant of the list of locations. + * @return + */ + public List getLocations() { + return Collections.unmodifiableList(locations); + } + + /** + * Returns the index of the contig into which this FilePointer points (a FilePointer can represent + * regions in at most one contig). + * + * @return the index of the contig into which this FilePointer points + */ + public int getContigIndex() { + return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; + } + + /** + * Returns the IntervalMergingRule used by this FilePointer to merge adjacent locations + * + * @return the IntervalMergingRule used by this FilePointer (never null) + */ + public IntervalMergingRule getIntervalMergingRule() { + return intervalMergingRule; + } + + /** + * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will + * ever visit during this GATK run? If this is set to true, the engine will expect to see only this + * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals + * from more than one contig. + * + * @return true if this FP is a monolithic FP representing all regions in all files, otherwise false + */ + public boolean isMonolithic() { + return isMonolithic; + } + + /** + * Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all + * regions in all files that we will ever visit, and is the only FP we will ever create. A monolithic + * FP may contain intervals from more than one contig. + * + * @param isMonolithic set this FP's monolithic status to this value + */ + public void setIsMonolithic( boolean isMonolithic ) { + this.isMonolithic = isMonolithic; + } + + @Override + public boolean equals(final Object other) { + if(!(other instanceof FilePointer)) + return false; + FilePointer otherFilePointer = (FilePointer)other; + + // intervals + if(this.locations.size() != otherFilePointer.locations.size()) + return false; + for(int i = 0; i < locations.size(); i++) { + if(!this.locations.get(i).equals(otherFilePointer.locations.get(i))) + return false; + } + + // fileSpans + if(this.fileSpans.size() != otherFilePointer.fileSpans.size()) + return false; + Iterator> thisEntries = this.fileSpans.entrySet().iterator(); + Iterator> otherEntries = otherFilePointer.fileSpans.entrySet().iterator(); + while(thisEntries.hasNext() || otherEntries.hasNext()) { + if(!thisEntries.next().equals(otherEntries.next())) + return false; + } + + return true; + } + + public void addLocation(final GenomeLoc location) { + validateLocation(location); + + this.locations.add(location); + if ( contigIndex == null ) { + contigIndex = location.getContigIndex(); + } + } + + public void addFileSpans(final SAMReaderID id, final SAMFileSpan fileSpan) { + this.fileSpans.put(id,fileSpan); + } + + public void addFileSpans(final Map fileSpans) { + this.fileSpans.putAll(fileSpans); + } + + + /** + * Computes the size of this file span, in uncompressed bytes. + * @return Size of the file span. + */ + public long size() { + long size = 0L; + for(SAMFileSpan fileSpan: fileSpans.values()) + size += ((GATKBAMFileSpan)fileSpan).size(); + return size; + } + + /** + * Returns the difference in size between two filespans. + * @param other Other filespan against which to measure. + * @return The difference in size between the two file pointers. + */ + public long minus(final FilePointer other) { + long difference = 0; + PeekableIterator> thisIterator = new PeekableIterator>(this.fileSpans.entrySet().iterator()); + PeekableIterator> otherIterator = new PeekableIterator>(other.fileSpans.entrySet().iterator()); + + while(thisIterator.hasNext()) { + // If there are no elements left in the 'other' iterator, spin out this iterator. + if(!otherIterator.hasNext()) { + GATKBAMFileSpan nextSpan = (GATKBAMFileSpan)thisIterator.next().getValue(); + difference += nextSpan.size(); + continue; + } + + // Otherwise, compare the latest value. + int compareValue = thisIterator.peek().getKey().compareTo(otherIterator.peek().getKey()); + + if(compareValue < 0) { + // This before other. + difference += ((GATKBAMFileSpan)thisIterator.next().getValue()).size(); + } + else if(compareValue > 0) { + // Other before this. + difference += ((GATKBAMFileSpan)otherIterator.next().getValue()).size(); + } + else { + // equality; difference the values. + GATKBAMFileSpan thisRegion = (GATKBAMFileSpan)thisIterator.next().getValue(); + GATKBAMFileSpan otherRegion = (GATKBAMFileSpan)otherIterator.next().getValue(); + difference += Math.abs(thisRegion.minus(otherRegion).size()); + } + } + return difference; + } + + /** + * Combines two file pointers into one. + * @param parser The genomelocparser to use when manipulating intervals. + * @param other File pointer to combine into this one. + * @return A completely new file pointer that is the combination of the two. + */ + public FilePointer combine(final GenomeLocParser parser, final FilePointer other) { + FilePointer combined = new FilePointer(intervalMergingRule); + + List intervals = new ArrayList(); + intervals.addAll(locations); + intervals.addAll(other.locations); + for(GenomeLoc interval: IntervalUtils.sortAndMergeIntervals(parser,intervals,intervalMergingRule)) + combined.addLocation(interval); + + PeekableIterator> thisIterator = new PeekableIterator>(this.fileSpans.entrySet().iterator()); + PeekableIterator> otherIterator = new PeekableIterator>(other.fileSpans.entrySet().iterator()); + + while(thisIterator.hasNext() || otherIterator.hasNext()) { + int compareValue; + if(!otherIterator.hasNext()) { + compareValue = -1; + } + else if(!thisIterator.hasNext()) + compareValue = 1; + else + compareValue = thisIterator.peek().getKey().compareTo(otherIterator.peek().getKey()); + + // This before other. + if(compareValue < 0) + mergeElementsInto(combined,thisIterator); + // Other before this. + else if(compareValue > 0) + mergeElementsInto(combined,otherIterator); + // equality; union the values. + else + mergeElementsInto(combined,thisIterator,otherIterator); + } + return combined; + } + + /** + * Roll the next element in the iterator into the combined entry. + * @param combined Entry into which to roll the next element. + * @param iterators Sources of next elements. + */ + private void mergeElementsInto(final FilePointer combined, Iterator>... iterators) { + if(iterators.length == 0) + throw new ReviewedGATKException("Tried to add zero elements to an existing file pointer."); + Map.Entry initialElement = iterators[0].next(); + GATKBAMFileSpan fileSpan = (GATKBAMFileSpan)initialElement.getValue(); + for(int i = 1; i < iterators.length; i++) + fileSpan = fileSpan.union((GATKBAMFileSpan)iterators[i].next().getValue()); + combined.addFileSpans(initialElement.getKey(),fileSpan); + } + + /** + * Efficiently generate the union of the n FilePointers passed in. Much more efficient than + * combining two FilePointers at a time using the combine() method above. + * + * IMPORTANT: the FilePointers to be unioned must either all represent regions on the + * same contig, or all be unmapped, since we cannot create FilePointers with a mix of + * contigs or with mixed mapped/unmapped regions. + * + * @param filePointers the FilePointers to union + * @param parser our GenomeLocParser + * @return the union of the FilePointers passed in + */ + public static FilePointer union( List filePointers, GenomeLocParser parser ) { + if ( filePointers == null || filePointers.isEmpty() ) { + return new FilePointer(IntervalMergingRule.ALL); + } + + Map> fileChunks = new HashMap>(); + List locations = new ArrayList(); + IntervalMergingRule mergeRule = filePointers.get(0).getIntervalMergingRule(); + + // First extract all intervals and file chunks from the FilePointers into unsorted, unmerged collections + for ( FilePointer filePointer : filePointers ) { + locations.addAll(filePointer.getLocations()); + if (mergeRule != filePointer.getIntervalMergingRule()) + throw new ReviewedGATKException("All FilePointers in FilePointer.union() must have use the same IntervalMergeRule"); + + for ( Map.Entry fileSpanEntry : filePointer.getFileSpans().entrySet() ) { + GATKBAMFileSpan fileSpan = (GATKBAMFileSpan)fileSpanEntry.getValue(); + + if ( fileChunks.containsKey(fileSpanEntry.getKey()) ) { + fileChunks.get(fileSpanEntry.getKey()).addAll(fileSpan.getGATKChunks()); + } + else { + fileChunks.put(fileSpanEntry.getKey(), fileSpan.getGATKChunks()); + } + } + } + + // Now sort and merge the intervals + List sortedMergedLocations = new ArrayList(); + sortedMergedLocations.addAll(IntervalUtils.sortAndMergeIntervals(parser, locations, mergeRule)); + + // For each BAM file, convert from an unsorted, unmerged list of chunks to a GATKBAMFileSpan containing + // the sorted, merged union of the chunks for that file + Map mergedFileSpans = new HashMap(fileChunks.size()); + for ( Map.Entry> fileChunksEntry : fileChunks.entrySet() ) { + List unmergedChunks = fileChunksEntry.getValue(); + mergedFileSpans.put(fileChunksEntry.getKey(), + (new GATKBAMFileSpan(unmergedChunks.toArray(new GATKChunk[unmergedChunks.size()]))).union(new GATKBAMFileSpan())); + } + + return new FilePointer(mergedFileSpans, mergeRule, sortedMergedLocations); + } + + /** + * Returns true if any of the file spans in this FilePointer overlap their counterparts in + * the other FilePointer. "Overlap" is defined as having an overlapping extent (the region + * from the start of the first chunk to the end of the last chunk). + * + * @param other the FilePointer against which to check overlap with this FilePointer + * @return true if any file spans overlap their counterparts in other, otherwise false + */ + public boolean hasFileSpansOverlappingWith( FilePointer other ) { + for ( Map.Entry thisFilePointerEntry : fileSpans.entrySet() ) { + GATKBAMFileSpan thisFileSpan = new GATKBAMFileSpan(thisFilePointerEntry.getValue()); + + SAMFileSpan otherEntry = other.fileSpans.get(thisFilePointerEntry.getKey()); + if ( otherEntry == null ) { + continue; // no counterpart for this file span in other + } + GATKBAMFileSpan otherFileSpan = new GATKBAMFileSpan(otherEntry); + + if ( thisFileSpan.getExtent().overlaps(otherFileSpan.getExtent()) ) { + return true; + } + } + + return false; + } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("FilePointer:\n"); + builder.append("\tlocations = {"); + builder.append(Utils.join(";",locations)); + builder.append("}\n\tregions = \n"); + for(Map.Entry entry: fileSpans.entrySet()) { + builder.append(entry.getKey()); + builder.append("= {"); + builder.append(entry.getValue()); + builder.append("}"); + } + return builder.toString(); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java new file mode 100644 index 000000000..3b94e438a --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java @@ -0,0 +1,464 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.Bin; +import htsjdk.samtools.GATKBin; +import htsjdk.samtools.GATKChunk; +import htsjdk.samtools.LinearIndex; +import htsjdk.samtools.seekablestream.SeekableBufferedStream; +import htsjdk.samtools.seekablestream.SeekableFileStream; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * A basic interface for querying BAM indices. + * Very much not thread-safe. + * + * @author mhanna + * @version 0.1 + */ +public class GATKBAMIndex { + /** + * BAM index file magic number. + */ + private static final byte[] BAM_INDEX_MAGIC = "BAI\1".getBytes(); + + /** + * Reports the total amount of genomic data that any bin can index. + */ + protected static final int BIN_GENOMIC_SPAN = 512*1024*1024; + + /** + * What is the starting bin for each level? + */ + private static final int[] LEVEL_STARTS = {0,1,9,73,585,4681}; + + /** + * Reports the maximum number of bins that can appear in a BAM file. + */ + public static final int MAX_BINS = 37450; // =(8^6-1)/7+1 + + private final File mFile; + + //TODO: figure out a good value for this buffer size + private final int BUFFERED_STREAM_BUFFER_SIZE = 8192; + + /** + * Number of sequences stored in this index. + */ + private final int sequenceCount; + + /** + * A cache of the starting positions of the sequences. + */ + private final long[] sequenceStartCache; + + private SeekableFileStream fileStream; + private SeekableBufferedStream bufferedStream; + private long fileLength; + + public GATKBAMIndex(final File file) { + mFile = file; + // Open the file stream. + openIndexFile(); + + // Verify the magic number. + seek(0); + final byte[] buffer = readBytes(4); + if (!Arrays.equals(buffer, BAM_INDEX_MAGIC)) { + throw new ReviewedGATKException("Invalid file header in BAM index " + mFile + + ": " + new String(buffer)); + } + + seek(4); + + sequenceCount = readInteger(); + + // Create a cache of the starting position of each sequence. Initialize it to -1. + sequenceStartCache = new long[sequenceCount]; + for(int i = 1; i < sequenceCount; i++) + sequenceStartCache[i] = -1; + + // Seed the first element in the array with the current position. + if(sequenceCount > 0) + sequenceStartCache[0] = position(); + + closeIndexFile(); + } + + public GATKBAMIndexData readReferenceSequence(final int referenceSequence) { + openIndexFile(); + + if (referenceSequence >= sequenceCount) + throw new ReviewedGATKException("Invalid sequence number " + referenceSequence + " in index file " + mFile); + + skipToSequence(referenceSequence); + + int binCount = readInteger(); + List bins = new ArrayList(); + for (int binNumber = 0; binNumber < binCount; binNumber++) { + final int indexBin = readInteger(); + final int nChunks = readInteger(); + + List chunks = new ArrayList(nChunks); + long[] rawChunkData = readLongs(nChunks*2); + for (int ci = 0; ci < nChunks; ci++) { + final long chunkBegin = rawChunkData[ci*2]; + final long chunkEnd = rawChunkData[ci*2+1]; + chunks.add(new GATKChunk(chunkBegin, chunkEnd)); + } + GATKBin bin = new GATKBin(referenceSequence, indexBin); + bin.setChunkList(chunks.toArray(new GATKChunk[chunks.size()])); + while(indexBin >= bins.size()) + bins.add(null); + bins.set(indexBin,bin); + } + + final int nLinearBins = readInteger(); + long[] linearIndexEntries = readLongs(nLinearBins); + + LinearIndex linearIndex = new LinearIndex(referenceSequence,0,linearIndexEntries); + + closeIndexFile(); + + return new GATKBAMIndexData(this,referenceSequence,bins,linearIndex); + } + + /** + * Get the number of levels employed by this index. + * @return Number of levels in this index. + */ + public static int getNumIndexLevels() { + return LEVEL_STARTS.length; + } + + /** + * Gets the first bin in the given level. + * @param levelNumber Level number. 0-based. + * @return The first bin in this level. + */ + public static int getFirstBinInLevel(final int levelNumber) { + return LEVEL_STARTS[levelNumber]; + } + + /** + * Gets the number of bins in the given level. + * @param levelNumber Level number. 0-based. + * @return The size (number of possible bins) of the given level. + */ + public int getLevelSize(final int levelNumber) { + if(levelNumber == getNumIndexLevels()-1) + return MAX_BINS-LEVEL_STARTS[levelNumber]-1; + else + return LEVEL_STARTS[levelNumber+1]-LEVEL_STARTS[levelNumber]; + } + + /** + * Gets the level associated with the given bin number. + * @param bin The bin for which to determine the level. + * @return the level associated with the given bin number. + */ + public int getLevelForBin(final Bin bin) { + GATKBin gatkBin = new GATKBin(bin); + if(gatkBin.getBinNumber() >= MAX_BINS) + throw new ReviewedGATKException("Tried to get level for invalid bin in index file " + mFile); + for(int i = getNumIndexLevels()-1; i >= 0; i--) { + if(gatkBin.getBinNumber() >= LEVEL_STARTS[i]) + return i; + } + throw new ReviewedGATKException("Unable to find correct bin for bin " + bin + " in index file " + mFile); + } + + /** + * Gets the first locus that this bin can index into. + * @param bin The bin to test. + * @return The last position that the given bin can represent. + */ + public int getFirstLocusInBin(final Bin bin) { + final int level = getLevelForBin(bin); + final int levelStart = LEVEL_STARTS[level]; + final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart; + return (new GATKBin(bin).getBinNumber() - levelStart)*(BIN_GENOMIC_SPAN /levelSize)+1; + } + + /** + * Gets the last locus that this bin can index into. + * @param bin The bin to test. + * @return The last position that the given bin can represent. + */ + public int getLastLocusInBin(final Bin bin) { + final int level = getLevelForBin(bin); + final int levelStart = LEVEL_STARTS[level]; + final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart; + return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize); + } + + /** + * Use to get close to the unmapped reads at the end of a BAM file. + * @return The file offset of the first record in the last linear bin, or -1 + * if there are no elements in linear bins (i.e. no mapped reads). + */ + public long getStartOfLastLinearBin() { + openIndexFile(); + + seek(4); + + final int sequenceCount = readInteger(); + // Because no reads may align to the last sequence in the sequence dictionary, + // grab the last element of the linear index for each sequence, and return + // the last one from the last sequence that has one. + long lastLinearIndexPointer = -1; + for (int i = 0; i < sequenceCount; i++) { + // System.out.println("# Sequence TID: " + i); + final int nBins = readInteger(); + // System.out.println("# nBins: " + nBins); + for (int j1 = 0; j1 < nBins; j1++) { + // Skip bin # + skipBytes(4); + final int nChunks = readInteger(); + // Skip chunks + skipBytes(16 * nChunks); + } + final int nLinearBins = readInteger(); + if (nLinearBins > 0) { + // Skip to last element of list of linear bins + skipBytes(8 * (nLinearBins - 1)); + lastLinearIndexPointer = readLongs(1)[0]; + } + } + + closeIndexFile(); + + return lastLinearIndexPointer; + } + + /** + * Gets the possible number of bins for a given reference sequence. + * @return How many bins could possibly be used according to this indexing scheme to index a single contig. + */ + protected int getMaxAddressibleGenomicLocation() { + return BIN_GENOMIC_SPAN; + } + + protected void skipToSequence(final int referenceSequence) { + // Find the offset in the file of the last sequence whose position has been determined. Start here + // when searching the sequence for the next value to read. (Note that sequenceStartCache[0] will always + // be present, so no extra stopping condition is necessary. + int sequenceIndex = referenceSequence; + while(sequenceStartCache[sequenceIndex] == -1) + sequenceIndex--; + + // Advance to the most recently found position. + seek(sequenceStartCache[sequenceIndex]); + + for (int i = sequenceIndex; i < referenceSequence; i++) { + sequenceStartCache[i] = position(); + // System.out.println("# Sequence TID: " + i); + final int nBins = readInteger(); + // System.out.println("# nBins: " + nBins); + for (int j = 0; j < nBins; j++) { + final int bin = readInteger(); + final int nChunks = readInteger(); + // System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks); + skipBytes(16 * nChunks); + } + final int nLinearBins = readInteger(); + // System.out.println("# nLinearBins: " + nLinearBins); + skipBytes(8 * nLinearBins); + + } + + sequenceStartCache[referenceSequence] = position(); + } + + + + private void openIndexFile() { + try { + fileStream = new SeekableFileStream(mFile); + bufferedStream = new SeekableBufferedStream(fileStream,BUFFERED_STREAM_BUFFER_SIZE); + fileLength=bufferedStream.length(); + } + catch (IOException exc) { + throw new ReviewedGATKException("Unable to open index file (" + exc.getMessage() +")" + mFile, exc); + } + } + + private void closeIndexFile() { + try { + bufferedStream.close(); + fileStream.close(); + fileLength = -1; + } + catch (IOException exc) { + throw new ReviewedGATKException("Unable to close index file " + mFile, exc); + } + } + + private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8; + private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8; + + private byte[] readBytes(int count) { + ByteBuffer buffer = getBuffer(count); + read(buffer); + buffer.flip(); + byte[] contents = new byte[count]; + buffer.get(contents); + return contents; + } + + private int readInteger() { + ByteBuffer buffer = getBuffer(INT_SIZE_IN_BYTES); + read(buffer); + buffer.flip(); + return buffer.getInt(); + } + + /** + * Reads an array of longs from the file channel, returning the results as an array. + * @param count Number of longs to read. + * @return An array of longs. Size of array should match count. + */ + private long[] readLongs(final int count) { + ByteBuffer buffer = getBuffer(count*LONG_SIZE_IN_BYTES); + read(buffer); + buffer.flip(); + long[] result = new long[count]; + for(int i = 0; i < count; i++) + result[i] = buffer.getLong(); + return result; + } + + private void read(final ByteBuffer buffer) { + final int bytesRequested = buffer.limit(); + if (bytesRequested == 0) + return; + + try { + + //BufferedInputStream cannot read directly into a byte buffer, so we read into an array + //and put the result into the bytebuffer after the if statement. + + // We have a rigid expectation here to read in exactly the number of bytes we've limited + // our buffer to -- if there isn't enough data in the file, the index + // must be truncated or otherwise corrupt: + if(bytesRequested > fileLength - bufferedStream.position()){ + throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " + + "It's likely that this file is truncated or corrupt -- " + + "Please try re-indexing the corresponding BAM file.", + mFile)); + } + + int bytesRead = bufferedStream.read(byteArray, 0, bytesRequested); + + // We have a rigid expectation here to read in exactly the number of bytes we've limited + // our buffer to -- if we encounter EOF (-1), the index + // must be truncated or otherwise corrupt: + if (bytesRead <= 0) { + throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " + + "It's likely that this file is truncated or corrupt -- " + + "Please try re-indexing the corresponding BAM file.", + mFile)); + } + + if(bytesRead != bytesRequested) + throw new RuntimeException("Read amount different from requested amount. This should not happen."); + + buffer.put(byteArray, 0, bytesRequested); + } + catch(IOException ex) { + throw new ReviewedGATKException("Index: unable to read bytes from index file " + mFile); + } + } + + + /** + * A reusable buffer for use by this index generator. + * TODO: Should this be a SoftReference? + */ + private ByteBuffer buffer = null; + + //BufferedStream don't read into ByteBuffers, so we need this temporary array + private byte[] byteArray=null; + private ByteBuffer getBuffer(final int size) { + if(buffer == null || buffer.capacity() < size) { + // Allocate a new byte buffer. For now, make it indirect to make sure it winds up on the heap for easier debugging. + buffer = ByteBuffer.allocate(size); + byteArray = new byte[size]; + buffer.order(ByteOrder.LITTLE_ENDIAN); + } + buffer.clear(); + buffer.limit(size); + return buffer; + } + + private void skipBytes(final int count) { + try { + + //try to skip forward the requested amount. + long skipped = bufferedStream.skip(count); + + if( skipped != count ) { //if not managed to skip the requested amount + throw new ReviewedGATKException("Index: unable to reposition file channel of index file " + mFile); + } + } + catch(IOException ex) { + throw new ReviewedGATKException("Index: unable to reposition file channel of index file " + mFile); + } + } + + private void seek(final long position) { + try { + //to seek a new position, move the fileChannel, and reposition the bufferedStream + bufferedStream.seek(position); + } + catch(IOException ex) { + throw new ReviewedGATKException("Index: unable to reposition of file channel of index file " + mFile); + } + } + + /** + * Retrieve the position from the current file channel. + * @return position of the current file channel. + */ + private long position() { + try { + return bufferedStream.position(); + } + catch (IOException exc) { + throw new ReviewedGATKException("Unable to read position from index file " + mFile, exc); + } + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexData.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexData.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexData.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexData.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalOverlapFilteringIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalOverlapFilteringIterator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalOverlapFilteringIterator.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalOverlapFilteringIterator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalSharder.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalSharder.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalSharder.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalSharder.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShard.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShard.java new file mode 100644 index 000000000..4714df9b7 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShard.java @@ -0,0 +1,61 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.SAMFileSpan; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.util.List; +import java.util.Map; + +/** + * Handles locus shards of BAM information. + * @author aaron + * @version 1.0 + * @date Apr 7, 2009 + */ +public class LocusShard extends Shard { + /** + * Create a new locus shard, divided by index. + * @param intervals List of intervals to process. + * @param fileSpans File spans associated with that interval. + */ + public LocusShard(GenomeLocParser parser, SAMDataSource dataSource, List intervals, Map fileSpans) { + super(parser, ShardType.LOCUS, intervals, dataSource, fileSpans, false); + } + + /** + * String representation of this shard. + * @return A string representation of the boundaries of this shard. + */ + @Override + public String toString() { + return Utils.join(";",getGenomeLocs()); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShardBalancer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShardBalancer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShardBalancer.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShardBalancer.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShard.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShard.java new file mode 100644 index 000000000..d8ae3bf55 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShard.java @@ -0,0 +1,271 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.util.PeekableIterator; +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIteratorAdapter; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.util.*; + +/** + * + * User: aaron + * Date: Apr 10, 2009 + * Time: 5:03:13 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + +/** + * Expresses a shard of read data in block format. + * + * @author mhanna + * @version 0.1 + */ +public class ReadShard extends Shard { + + /** + * Default read shard buffer size + */ + public static final int DEFAULT_MAX_READS = 10000; + + /** + * What is the maximum number of reads per BAM file which should go into a read shard. + * + * TODO: this non-final static variable should either be made final or turned into an + * TODO: instance variable somewhere -- as both static and mutable it wreaks havoc + * TODO: with tests that use multiple instances of SAMDataSource (since SAMDataSource + * TODO: changes this value) + */ + public static int MAX_READS = DEFAULT_MAX_READS; + + /** + * The reads making up this shard. + */ + private final Collection reads = new ArrayList(MAX_READS); + + public ReadShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map fileSpans, List loci, boolean isUnmapped) { + super(parser, ShardType.READ, loci, readsDataSource, fileSpans, isUnmapped); + } + + /** + * Sets the maximum number of reads buffered in a read shard. Implemented as a weirdly static interface + * until we know what effect tuning this parameter has. + * + * TODO: this mutable static interface is awful and breaks tests -- need to refactor + * + * @param bufferSize New maximum number + */ + static void setReadBufferSize(final int bufferSize) { + MAX_READS = bufferSize; + } + + /** + * What read buffer size are we using? + * + * @return + */ + public static int getReadBufferSize() { + return MAX_READS; + } + + /** + * Returns true if this shard is meant to buffer reads, rather + * than just holding pointers to their locations. + * @return True if this shard can buffer reads. False otherwise. + */ + public boolean buffersReads() { + return true; + } + + /** + * Returns true if the read buffer is currently full. + * @return True if this shard's buffer is full (and the shard can buffer reads). + */ + public boolean isBufferEmpty() { + return reads.size() == 0; + } + + /** + * Returns true if the read buffer is currently full. + * @return True if this shard's buffer is full (and the shard can buffer reads). + */ + public boolean isBufferFull() { + return reads.size() > ReadShard.MAX_READS; + } + + /** + * Adds a read to the read buffer. + * @param read Add a read to the internal shard buffer. + */ + public void addRead(SAMRecord read) { + // DO NOT validate that the buffer is full. Paired read sharding will occasionally have to stuff another + // read or two into the buffer. + reads.add(read); + } + + /** + * Fills this shard's buffer with reads from the iterator passed in + * + * @param readIter Iterator from which to draw the reads to fill the shard + */ + @Override + public void fill( PeekableIterator readIter ) { + if( ! buffersReads() ) + throw new ReviewedGATKException("Attempting to fill a non-buffering shard."); + + SAMFileHeader.SortOrder sortOrder = getReadProperties().getSortOrder(); + SAMRecord read = null; + + while( ! isBufferFull() && readIter.hasNext() ) { + final SAMRecord nextRead = readIter.peek(); + if ( read == null || (nextRead.getReferenceIndex().equals(read.getReferenceIndex())) ) { + // only add reads to the shard if they are on the same contig + read = readIter.next(); + addRead(read); + } else { + break; + } + } + + // If the reads are sorted in coordinate order, ensure that all reads + // having the same alignment start become part of the same shard, to allow + // downsampling to work better across shard boundaries. Note that because our + // read stream has already been fed through the positional downsampler, which + // ensures that at each alignment start position there are no more than dcov + // reads, we're in no danger of accidentally creating a disproportionately huge + // shard + if ( sortOrder == SAMFileHeader.SortOrder.coordinate ) { + while ( readIter.hasNext() ) { + SAMRecord additionalRead = readIter.peek(); + + // Stop filling the shard as soon as we encounter a read having a different + // alignment start or contig from the last read added in the earlier loop + // above, or an unmapped read + if ( read == null || + additionalRead.getReadUnmappedFlag() || + ! additionalRead.getReferenceIndex().equals(read.getReferenceIndex()) || + additionalRead.getAlignmentStart() != read.getAlignmentStart() ) { + break; + } + + addRead(readIter.next()); + } + } + + // If the reads are sorted in queryname order, ensure that all reads + // having the same queryname become part of the same shard. + if( sortOrder == SAMFileHeader.SortOrder.queryname ) { + while( readIter.hasNext() ) { + SAMRecord nextRead = readIter.peek(); + if( read == null || ! read.getReadName().equals(nextRead.getReadName()) ) + break; + addRead(readIter.next()); + } + } + } + + /** + * Creates an iterator over reads stored in this shard's read cache. + * @return + */ + public GATKSAMIterator iterator() { + return GATKSAMIteratorAdapter.adapt(reads.iterator()); + } + + /** + * String representation of this shard. + * @return A string representation of the boundaries of this shard. + */ + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + for(Map.Entry entry: getFileSpans().entrySet()) { + sb.append(entry.getKey()); + sb.append(": "); + sb.append(entry.getValue()); + sb.append(' '); + } + return sb.toString(); + } + + /** + * Get the full span from the start of the left most read to the end of the right most one + * + * Note this may be different than the getLocation() of the shard, as this reflects the + * targeted span, not the actual span of reads + * + * @return the genome loc representing the span of these reads on the genome + */ + public GenomeLoc getReadsSpan() { + if ( isUnmapped() || super.getGenomeLocs() == null || reads.isEmpty() ) + return super.getLocation(); + else { + int start = Integer.MAX_VALUE; + int stop = Integer.MIN_VALUE; + String contig = null; + boolean foundMapped = false; + + for ( final SAMRecord read : reads ) { + if ( contig != null && ! read.getReferenceName().equals(contig) ) + throw new ReviewedGATKException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. " + + "First contig is " + contig + " next read was " + read.getReferenceName() ); + contig = read.getReferenceName(); + + // Even if this shard as a *whole* is not "unmapped", we can still encounter *individual* unmapped mates + // of mapped reads within this shard's buffer. In fact, if we're very unlucky with shard boundaries, + // this shard might consist *only* of unmapped mates! We need to refrain from using the alignment + // starts/stops of these unmapped mates, and detect the case where the shard has been filled *only* + // with unmapped mates. + if ( ! read.getReadUnmappedFlag() ) { + foundMapped = true; + if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart(); + if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd(); + } + } + + assert contig != null; + + if ( ! foundMapped || contig.equals("*") ) // all reads are unmapped + return GenomeLoc.UNMAPPED; + else + return parser.createGenomeLoc(contig, start, stop); + } + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShardBalancer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShardBalancer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShardBalancer.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShardBalancer.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java new file mode 100644 index 000000000..79b853e6b --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java @@ -0,0 +1,1251 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.MergingSamRecordIterator; +import htsjdk.samtools.SamFileHeaderMerger; +import htsjdk.samtools.*; +import htsjdk.samtools.util.CloseableIterator; +import htsjdk.samtools.util.CloserUtil; +import htsjdk.samtools.util.RuntimeIOException; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.ReadMetrics; +import org.broadinstitute.gatk.engine.ReadProperties; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.engine.filters.CountingFilteringIterator; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.iterators.*; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.SimpleTimer; +import org.broadinstitute.gatk.engine.iterators.ReadTransformingIterator; +import org.broadinstitute.gatk.utils.downsampling.*; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIteratorAdapter; +import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecordIterator; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.io.File; +import java.util.*; +import java.util.concurrent.Callable; + +/** + * User: aaron + * Date: Mar 26, 2009 + * Time: 2:36:16 PM + *

    + * Converts shards to SAM iterators over the specified region + */ +public class SAMDataSource { + /** Reference file */ + private final File referenceFile; + + /** Backing support for reads. */ + protected final ReadProperties readProperties; + + /** + * Runtime metrics of reads filtered, etc. + */ + private final ReadMetrics readMetrics; + + /** + * Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering. + */ + protected final GenomeLocParser genomeLocParser; + + /** + * Identifiers for the readers driving this data source. + */ + private final Collection readerIDs; + + /** + * How strict are the readers driving this data source. + */ + private final ValidationStringency validationStringency; + + /** + * Do we want to remove the program records from this data source? + */ + private final boolean removeProgramRecords; + + /** + * Store BAM indices for each reader present. + */ + private final Map bamIndices = new HashMap(); + + /** + * The merged header. + */ + private final SAMFileHeader mergedHeader; + + /** + * The constituent headers of the unmerged files. + */ + private final Map headers = new HashMap(); + + /** + * The sort order of the BAM files. Files without a sort order tag are assumed to be + * in coordinate order. + */ + private SAMFileHeader.SortOrder sortOrder = null; + + /** + * Whether the read groups in overlapping files collide. + */ + private final boolean hasReadGroupCollisions; + + /** + * Maps the SAM readers' merged read group ids to their original ids. Since merged read group ids + * are always unique, we can simply use a map here, no need to stratify by reader. + */ + private final ReadGroupMapping mergedToOriginalReadGroupMappings = new ReadGroupMapping(); + + /** + * Maps the SAM readers' original read group ids to their revised ids. This mapping must be stratified + * by readers, since there can be readgroup id collision: different bam files (readers) can list the + * same read group id, which will be disambiguated when these input streams are merged. + */ + private final Map originalToMergedReadGroupMappings = new HashMap(); + + /** + * Mapping from input file path to new sample name. Used only when doing on-the-fly sample renaming. + */ + private Map sampleRenameMap = null; + + /** our log, which we want to capture anything from this class */ + private static Logger logger = Logger.getLogger(SAMDataSource.class); + + /** + * A collection of readers driving the merging process. + */ + private final SAMResourcePool resourcePool; + + /** + * Asynchronously loads BGZF blocks. + */ + private final BGZFBlockLoadingDispatcher dispatcher; + + /** + * How are threads allocated. + */ + private final ThreadAllocation threadAllocation; + + /** + * How are adjacent intervals merged by the sharder? + */ + private final IntervalMergingRule intervalMergingRule; + + /** + * Static set of unsupported programs that create bam files. + * The key is the PG record ID and the value is the name of the tool that created it + */ + private static Map unsupportedPGs = new HashMap<>(); + static { + unsupportedPGs.put("GATK ReduceReads", "ReduceReads"); + } + + /** + * Create a new SAM data source given the supplied read metadata. + * + * For testing purposes + * + * @param samFiles list of reads files. + */ + public SAMDataSource(final File referenceFile, final Collection samFiles, + final ThreadAllocation threadAllocation, final Integer numFileHandles, + final GenomeLocParser genomeLocParser) { + this( + referenceFile, + samFiles, + threadAllocation, + numFileHandles, + genomeLocParser, + false, + ValidationStringency.STRICT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false); + } + + /** + * See complete constructor. Does not enable BAQ by default. + * + * For testing purposes + */ + public SAMDataSource( + final File referenceFile, + Collection samFiles, + ThreadAllocation threadAllocation, + Integer numFileHandles, + GenomeLocParser genomeLocParser, + boolean useOriginalBaseQualities, + ValidationStringency strictness, + Integer readBufferSize, + DownsamplingMethod downsamplingMethod, + ValidationExclusion exclusionList, + Collection supplementalFilters, + boolean includeReadsWithDeletionAtLoci) { + this( referenceFile, + samFiles, + threadAllocation, + numFileHandles, + genomeLocParser, + useOriginalBaseQualities, + strictness, + readBufferSize, + downsamplingMethod, + exclusionList, + supplementalFilters, + Collections.emptyList(), + includeReadsWithDeletionAtLoci, + (byte) -1, + false, + false, + null, + IntervalMergingRule.ALL); + } + + /** + * Create a new SAM data source given the supplied read metadata. + * @param referenceFile reference file. + * @param samFiles list of reads files. + * @param useOriginalBaseQualities True if original base qualities should be used. + * @param strictness Stringency of reads file parsing. + * @param readBufferSize Number of reads to hold in memory per BAM. + * @param downsamplingMethod Method for downsampling reads at a given locus. + * @param exclusionList what safety checks we're willing to let slide + * @param supplementalFilters additional filters to dynamically apply. + * @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method + * will explicitly list reads with deletion over the current reference base; otherwise, only observed + * bases will be seen in the pileups, and the deletions will be skipped silently. + * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. + * @param keepReadsInLIBS should we keep a unique list of reads in LIBS? + * @param sampleRenameMap Map of BAM file to new sample ID used during on-the-fly runtime sample renaming. + * Will be null if we're not doing sample renaming. + * @param intervalMergingRule how are adjacent intervals merged by the sharder + */ + public SAMDataSource( + final File referenceFile, + Collection samFiles, + ThreadAllocation threadAllocation, + Integer numFileHandles, + GenomeLocParser genomeLocParser, + boolean useOriginalBaseQualities, + ValidationStringency strictness, + Integer readBufferSize, + DownsamplingMethod downsamplingMethod, + ValidationExclusion exclusionList, + Collection supplementalFilters, + List readTransformers, + boolean includeReadsWithDeletionAtLoci, + byte defaultBaseQualities, + boolean removeProgramRecords, + final boolean keepReadsInLIBS, + final Map sampleRenameMap, + final IntervalMergingRule intervalMergingRule) { + + this.referenceFile = referenceFile; + this.readMetrics = new ReadMetrics(); + this.genomeLocParser = genomeLocParser; + this.intervalMergingRule = intervalMergingRule; + + readerIDs = samFiles; + + this.threadAllocation = threadAllocation; + // TODO: Consider a borrowed-thread dispatcher implementation. + if(this.threadAllocation.getNumIOThreads() > 0) { + logger.info("Running in asynchronous I/O mode; number of threads = " + this.threadAllocation.getNumIOThreads()); + dispatcher = new BGZFBlockLoadingDispatcher(this.threadAllocation.getNumIOThreads(), numFileHandles != null ? numFileHandles : 1); + } + else + dispatcher = null; + + validationStringency = strictness; + this.removeProgramRecords = removeProgramRecords; + if(readBufferSize != null) + ReadShard.setReadBufferSize(readBufferSize); // TODO: use of non-final static variable here is just awful, especially for parallel tests + else { + // Choose a sensible default for the read buffer size. + // Previously we we're picked 100000 reads per BAM per shard with a max cap of 250K reads in memory at once. + // Now we are simply setting it to 100K reads + ReadShard.setReadBufferSize(100000); + } + + this.sampleRenameMap = sampleRenameMap; + + resourcePool = new SAMResourcePool(Integer.MAX_VALUE); + SAMReaders readers = resourcePool.getAvailableReaders(); + + // Determine the sort order. + for(SAMReaderID readerID: readerIDs) { + if (! readerID.getSamFile().canRead() ) + throw new UserException.CouldNotReadInputFile(readerID.getSamFile(),"file is not present or user does not have appropriate permissions. " + + "Please check that the file is present and readable and try again."); + + // Get the sort order, forcing it to coordinate if unsorted. + SamReader reader = readers.getReader(readerID); + SAMFileHeader header = reader.getFileHeader(); + + headers.put(readerID,header); + + if ( header.getReadGroups().isEmpty() ) { + throw new UserException.MalformedBAM(readers.getReaderID(reader).getSamFile(), + "SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups"); + } + + SAMFileHeader.SortOrder sortOrder = header.getSortOrder() != SAMFileHeader.SortOrder.unsorted ? header.getSortOrder() : SAMFileHeader.SortOrder.coordinate; + + // Validate that all input files are sorted in the same order. + if(this.sortOrder != null && this.sortOrder != sortOrder) + throw new UserException.MissortedBAM(String.format("Attempted to process mixed of files sorted as %s and %s.",this.sortOrder,sortOrder)); + + // Update the sort order. + this.sortOrder = sortOrder; + } + + mergedHeader = readers.getMergedHeader(); + hasReadGroupCollisions = readers.hasReadGroupCollisions(); + + readProperties = new ReadProperties( + samFiles, + mergedHeader, + sortOrder, + useOriginalBaseQualities, + strictness, + downsamplingMethod, + exclusionList, + supplementalFilters, + readTransformers, + includeReadsWithDeletionAtLoci, + defaultBaseQualities, + keepReadsInLIBS); + + // cache the read group id (original) -> read group id (merged) + // and read group id (merged) -> read group id (original) mappings. + for(SAMReaderID id: readerIDs) { + SamReader reader = readers.getReader(id); + + ReadGroupMapping mappingToMerged = new ReadGroupMapping(); + + List readGroups = reader.getFileHeader().getReadGroups(); + for(SAMReadGroupRecord readGroup: readGroups) { + if(hasReadGroupCollisions) { + mappingToMerged.put(readGroup.getReadGroupId(),readers.getReadGroupId(id,readGroup.getReadGroupId())); + mergedToOriginalReadGroupMappings.put(readers.getReadGroupId(id,readGroup.getReadGroupId()),readGroup.getReadGroupId()); + } else { + mappingToMerged.put(readGroup.getReadGroupId(),readGroup.getReadGroupId()); + mergedToOriginalReadGroupMappings.put(readGroup.getReadGroupId(),readGroup.getReadGroupId()); + } + } + + originalToMergedReadGroupMappings.put(id,mappingToMerged); + } + + for(SAMReaderID id: readerIDs) { + File indexFile = findIndexFile(id.getSamFile()); + if(indexFile != null) + bamIndices.put(id,new GATKBAMIndex(indexFile)); + } + + resourcePool.releaseReaders(readers); + } + + /** + * Checks whether the provided SAM header if from a reduced bam file. + * @param header the SAM header for a given file + * @throws UserException if the header is from a reduced bam + */ + private void checkForUnsupportedBamFile(final SAMFileHeader header) { + for ( final SAMProgramRecord PGrecord : header.getProgramRecords() ) { + if ( unsupportedPGs.containsKey(PGrecord.getId()) ) + throw new UserException("The GATK no longer supports running off of BAMs produced by " + unsupportedPGs.get(PGrecord.getId())); + } + } + + public void close() { + SAMReaders readers = resourcePool.getAvailableReaders(); + for(SAMReaderID readerID: readerIDs) { + SamReader reader = readers.getReader(readerID); + CloserUtil.close(reader); + } + } + + /** + * Returns Reads data structure containing information about the reads data sources placed in this pool as well as + * information about how they are downsampled, sorted, and filtered + * @return + */ + public ReadProperties getReadsInfo() { return readProperties; } + + /** + * Checks to see whether any reads files are supplying data. + * @return True if no reads files are supplying data to the traversal; false otherwise. + */ + public boolean isEmpty() { + return readProperties.getSAMReaderIDs().size() == 0; + } + + /** + * Gets the SAM file associated with a given reader ID. + * @param id The reader for which to retrieve the source file. + * @return the file actually associated with the id. + */ + public File getSAMFile(SAMReaderID id) { + return id.getSamFile(); + } + + /** + * Returns readers used by this data source. + * @return A list of SAM reader IDs. + */ + public Collection getReaderIDs() { + return readerIDs; + } + + /** + * Retrieves the id of the reader which built the given read. + * @param read The read to test. + * @return ID of the reader. + */ + public SAMReaderID getReaderID(SAMRecord read) { + return resourcePool.getReaderID(read.getFileSource().getReader()); + } + + /** + * Gets the merged header from the SAM file. + * @return The merged header. + */ + public SAMFileHeader getHeader() { + return mergedHeader; + } + + public SAMFileHeader getHeader(SAMReaderID id) { + return headers.get(id); + } + + /** + * Gets the revised read group id mapped to this 'original' read group id. + * @param reader for which to grab a read group. + * @param originalReadGroupId ID of the original read group. + * @return Merged read group ID. + */ + public String getReadGroupId(final SAMReaderID reader, final String originalReadGroupId) { + return originalToMergedReadGroupMappings.get(reader).get(originalReadGroupId); + } + + /** + * Gets the original read group id (as it was specified in the original input bam file) that maps onto + * this 'merged' read group id. + * @param mergedReadGroupId 'merged' ID of the read group (as it is presented by the read received from merged input stream). + * @return Merged read group ID. + */ + public String getOriginalReadGroupId(final String mergedReadGroupId) { + return mergedToOriginalReadGroupMappings.get(mergedReadGroupId); + } + + /** + * Gets the index for a particular reader. Always preloaded. + * @param id Id of the reader. + * @return The index. Will preload the index if necessary. + */ + public GATKBAMIndex getIndex(final SAMReaderID id) { + return bamIndices.get(id); + } + + /** + * Return true if the index for a particular reader exists. + * @param id Id of the reader. + * @return True if the index exists. + */ + public boolean hasIndex(final SAMReaderID id) { + return bamIndices.containsKey(id); + } + + /** + * True if all readers that require an index for SAMFileSpan creation have an index. + * @return True if all readers that require an index for SAMFileSpan creation have an index. + */ + public boolean hasIndex() { + for (final SAMReaderID readerID: readerIDs) + if (isSAMFileSpanSupported(readerID)) + if (!hasIndex(readerID)) + return false; + return true; + } + /** + * Returns true if the reader can use file spans. + * @return true if file spans are supported. + */ + private boolean isSAMFileSpanSupported(final SAMReaderID readerID) { + // example: https://github.com/samtools/htsjdk/blob/ee4308ede60962f3ab4275473ac384724b471149/src/java/htsjdk/samtools/BAMFileReader.java#L341 + return readerID.getSamFile().getName().toLowerCase().endsWith(SamReader.Type.BAM_TYPE.fileExtension()); + } + + /** + * Returns true if the reader caches its SAMFileHeader for each iterator. + * @return true if this reader caches its SAMFileHeader for each iterator. + */ + private boolean isIteratorSAMFileHeaderCached(final SAMReaderID readerID) { + // example: https://github.com/samtools/htsjdk/blob/ee4308ede60962f3ab4275473ac384724b471149/src/java/htsjdk/samtools/CRAMFileReader.java#L183 + return !readerID.getSamFile().getName().toLowerCase().endsWith(SamReader.Type.CRAM_TYPE.fileExtension()); + } + + /** + * Retrieves the sort order of the readers. + * @return Sort order. Can be unsorted, coordinate order, or query name order. + */ + public SAMFileHeader.SortOrder getSortOrder() { + return sortOrder; + } + + /** + * Gets the cumulative read metrics for shards already processed. + * @return Cumulative read metrics. + */ + public ReadMetrics getCumulativeReadMetrics() { + // don't return a clone here because the engine uses a pointer to this object + return readMetrics; + } + + /** + * Incorporate the given read metrics into the cumulative read metrics. + * @param readMetrics The 'incremental' read metrics, to be incorporated into the cumulative metrics. + */ + public void incorporateReadMetrics(final ReadMetrics readMetrics) { + this.readMetrics.incrementMetrics(readMetrics); + } + + public GATKSAMIterator seek(Shard shard) { + if(shard.buffersReads()) { + return shard.iterator(); + } + else { + return getIterator(shard); + } + } + + /** + * Gets the reader associated with the given read. + * @param readers Available readers. + * @param read + * @return + */ + private SAMReaderID getReaderID(SAMReaders readers, SAMRecord read) { + for(SAMReaderID id: getReaderIDs()) { + if(readers.getReader(id) == read.getFileSource().getReader()) + return id; + } + throw new ReviewedGATKException("Unable to find id for reader associated with read " + read.getReadName()); + } + + /** + * Get the initial reader positions across all BAM files + * + * @return the start positions of the first chunk of reads for all BAM files + */ + protected Map getInitialReaderPositions() { + Map initialPositions = new HashMap(); + SAMReaders readers = resourcePool.getAvailableReaders(); + + for ( SAMReaderID id: getReaderIDs() ) { + GATKBAMFileSpan span; + try { + span = new GATKBAMFileSpan(readers.getReader(id).indexing().getFilePointerSpanningReads()); + } catch (RuntimeException e) { + if ("Not implemented.".equals(e.getMessage())) { https://github.com/samtools/htsjdk/blob/035d4319643657d715e93c53c13fe4a1f64e0188/src/java/htsjdk/samtools/CRAMFileReader.java#L197 + span = new GATKBAMFileSpan(new GATKChunk(0, Long.MAX_VALUE)); + } else { + throw e; + } + } + initialPositions.put(id, span); + } + + resourcePool.releaseReaders(readers); + return initialPositions; + } + + /** + * Get an iterator over the data types specified in the shard. + * + * @param shard The shard specifying the data limits. + * @return An iterator over the selected data. + */ + protected GATKSAMIterator getIterator( Shard shard ) { + return getIterator(resourcePool.getAvailableReaders(), shard, shard instanceof ReadShard); + } + + /** + * Get an iterator over the data types specified in the shard. + * @param readers Readers from which to load data. + * @param shard The shard specifying the data limits. + * @param enableVerification True to verify. For compatibility with old sharding strategy. + * @return An iterator over the selected data. + */ + private GATKSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) { + // Set up merging to dynamically merge together multiple BAMs. + Map> iteratorMap = new HashMap<>(); + + for(SAMReaderID id: getReaderIDs()) { + CloseableIterator iterator; + + // TODO: null used to be the signal for unmapped, but we've replaced that with a simple index query for the last bin. + // TODO: Kill this check once we've proven that the design elements are gone. + if(shard.getFileSpans().get(id) == null) + throw new ReviewedGATKException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported."); + + try { + if(threadAllocation.getNumIOThreads() > 0) { + // TODO: need to add friendly error if -nit is used with non BAM. Later, possibly add this capability with CRAM when htsjdk supports CRAM file spans are supported. + BlockInputStream inputStream = readers.getInputStream(id); + inputStream.submitAccessPlan(new BAMAccessPlan(id, inputStream, (GATKBAMFileSpan) shard.getFileSpans().get(id))); + BAMRecordCodec codec = new BAMRecordCodec(getHeader(id)); + codec.setInputStream(inputStream); + iterator = new BAMCodecIterator(inputStream,readers.getReader(id),codec); + } + else { + final SamReader reader = readers.getReader(id); + try { + iterator = ((SamReader.Indexing)reader).iterator(shard.getFileSpans().get(id)); + } catch (RuntimeException re) { + if ("Not implemented.".equals(re.getMessage())) { // https://github.com/samtools/htsjdk/blob/429f2a8585d9c98a3efd4cedc5188b60b1e66ac5/src/java/htsjdk/samtools/CRAMFileReader.java#L192 + // No way to jump into the file span. Query the whole file. + iterator = readers.getReader(id).iterator(); + } else { + throw re; + } + } + } + } catch ( RuntimeException e ) { // we need to catch RuntimeExceptions here because the Picard code is throwing them (among SAMFormatExceptions) sometimes + throw new UserException.MalformedBAM(id.getSamFile(), e.getMessage()); + } + + // At the moment, too many other classes to change for GATKSAMRecordIterator converter. + // Force the compiler to just let the conversion happen, since generics are erased anyway. + iterator = (CloseableIterator)(Object)new GATKSAMRecordIterator(iterator); + iterator = new MalformedBAMErrorReformatingIterator(id.getSamFile(), iterator); + if(shard.getGenomeLocs().size() > 0) + iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); + + iteratorMap.put(readers.getReader(id), iterator); + } + + MergingSamRecordIterator mergingIterator = readers.createMergingIterator(iteratorMap); + + // The readMetrics object being passed in should be that of this dataSource and NOT the shard: the dataSource's + // metrics is intended to keep track of the reads seen (and hence passed to the CountingFilteringIterator when + // we apply the decorators), whereas the shard's metrics is used to keep track the "records" seen. + return applyDecoratingIterators(readMetrics, + enableVerification, + readProperties.useOriginalBaseQualities(), + new ReleasingIterator(readers, GATKSAMIteratorAdapter.adapt(mergingIterator)), + readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), + readProperties.getSupplementalFilters(), + readProperties.getReadTransformers(), + readProperties.defaultBaseQualities(), + shard instanceof LocusShard); + } + + private class BAMCodecIterator implements CloseableIterator { + private final BlockInputStream inputStream; + private final SamReader reader; + private final BAMRecordCodec codec; + private SAMRecord nextRead; + + private BAMCodecIterator(final BlockInputStream inputStream, final SamReader reader, final BAMRecordCodec codec) { + this.inputStream = inputStream; + this.reader = reader; + this.codec = codec; + advance(); + } + + public boolean hasNext() { + return nextRead != null; + } + + public SAMRecord next() { + if(!hasNext()) + throw new NoSuchElementException("Unable to retrieve next record from BAMCodecIterator; input stream is empty"); + SAMRecord currentRead = nextRead; + advance(); + return currentRead; + } + + public void close() { + // NO-OP. + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from BAMCodecIterator"); + } + + private void advance() { + final long startCoordinate = inputStream.getFilePointer(); + nextRead = codec.decode(); + final long stopCoordinate = inputStream.getFilePointer(); + + if(reader != null && nextRead != null) + PicardNamespaceUtils.setFileSource(nextRead, new SAMFileSource(reader, new GATKBAMFileSpan(new GATKChunk(startCoordinate, stopCoordinate)))); + } + } + + /** + * Filter reads based on user-specified criteria. + * + * @param readMetrics metrics to track when using this iterator. + * @param enableVerification Verify the order of reads. + * @param useOriginalBaseQualities True if original base qualities should be used. + * @param wrappedIterator the raw data source. + * @param noValidationOfReadOrder Another trigger for the verifying iterator? TODO: look into this. + * @param supplementalFilters additional filters to apply to the reads. + * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. + * @param isLocusBasedTraversal true if we're dealing with a read stream from a LocusShard + * @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null. + */ + protected GATKSAMIterator applyDecoratingIterators(ReadMetrics readMetrics, + boolean enableVerification, + boolean useOriginalBaseQualities, + GATKSAMIterator wrappedIterator, + Boolean noValidationOfReadOrder, + Collection supplementalFilters, + List readTransformers, + byte defaultBaseQualities, + boolean isLocusBasedTraversal ) { + + // Always apply the ReadFormattingIterator before both ReadFilters and ReadTransformers. At a minimum, + // this will consolidate the cigar strings into canonical form. This has to be done before the read + // filtering, because not all read filters will behave correctly with things like zero-length cigar + // elements. If useOriginalBaseQualities is true or defaultBaseQualities >= 0, this iterator will also + // modify the base qualities. + wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities); + + // Read Filters: these are applied BEFORE downsampling, so that we downsample within the set of reads + // that actually survive filtering. Otherwise we could get much less coverage than requested. + wrappedIterator = GATKSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters)); + + // Downsampling: + + // For locus traversals where we're downsampling to coverage by sample, assume that the downsamplers + // will be invoked downstream from us in LocusIteratorByState. This improves performance by avoiding + // splitting/re-assembly of the read stream at this stage, and also allows for partial downsampling + // of individual reads. + boolean assumeDownstreamLIBSDownsampling = isLocusBasedTraversal && + readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && + readProperties.getDownsamplingMethod().toCoverage != null; + + // Apply downsampling iterators here only in cases where we know that LocusIteratorByState won't be + // doing any downsampling downstream of us + if ( ! assumeDownstreamLIBSDownsampling ) { + wrappedIterator = applyDownsamplingIterator(wrappedIterator); + } + + // unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification, + // verify the read ordering by applying a sort order iterator + if (!noValidationOfReadOrder && enableVerification) + wrappedIterator = new VerifyingSamIterator(wrappedIterator); + + // Read transformers: these are applied last, so that we don't bother transforming reads that get discarded + // by the read filters or downsampler. + for ( final ReadTransformer readTransformer : readTransformers ) { + if ( readTransformer.enabled() && readTransformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_INPUT ) + wrappedIterator = new ReadTransformingIterator(wrappedIterator, readTransformer); + } + + return wrappedIterator; + } + + protected GATKSAMIterator applyDownsamplingIterator( GATKSAMIterator wrappedIterator ) { + if ( readProperties.getDownsamplingMethod() == null || + readProperties.getDownsamplingMethod().type == DownsampleType.NONE ) { + return wrappedIterator; + } + + if ( readProperties.getDownsamplingMethod().toFraction != null ) { + + // If we're downsampling to a fraction of reads, there's no point in paying the cost of + // splitting/re-assembling the read stream by sample to run the FractionalDownsampler on + // reads from each sample separately, since the result would be the same as running the + // FractionalDownsampler on the entire stream. So, ALWAYS use the DownsamplingReadsIterator + // rather than the PerSampleDownsamplingReadsIterator, even if BY_SAMPLE downsampling + // was requested. + + return new DownsamplingReadsIterator(wrappedIterator, + new FractionalDownsampler(readProperties.getDownsamplingMethod().toFraction)); + } + else if ( readProperties.getDownsamplingMethod().toCoverage != null ) { + + // If we're downsampling to coverage, we DO need to pay the cost of splitting/re-assembling + // the read stream to run the downsampler on the reads for each individual sample separately if + // BY_SAMPLE downsampling was requested. + + if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) { + return new PerSampleDownsamplingReadsIterator(wrappedIterator, + new SimplePositionalDownsamplerFactory(readProperties.getDownsamplingMethod().toCoverage)); + } + else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) { + return new DownsamplingReadsIterator(wrappedIterator, + new SimplePositionalDownsampler(readProperties.getDownsamplingMethod().toCoverage)); + } + } + + return wrappedIterator; + } + + + private class SAMResourcePool { + /** + * How many entries can be cached in this resource pool? + */ + private final int maxEntries; + + /** + * All iterators of this reference-ordered data. + */ + private List allResources = new ArrayList(); + + /** + * All iterators that are not currently in service. + */ + private List availableResources = new ArrayList(); + + public SAMResourcePool(final int maxEntries) { + this.maxEntries = maxEntries; + } + + /** + * Choose a set of readers from the pool to use for this query. When complete, + * @return + */ + public synchronized SAMReaders getAvailableReaders() { + if(availableResources.size() == 0) + createNewResource(); + SAMReaders readers = availableResources.get(0); + availableResources.remove(readers); + return readers; + } + + public synchronized void releaseReaders(SAMReaders readers) { + if(!allResources.contains(readers)) + throw new ReviewedGATKException("Tried to return readers from the pool that didn't originate in the pool."); + availableResources.add(readers); + } + + /** + * Gets the reader id for the given reader. + * @param reader Reader for which to determine the id. + * @return id of the given reader. + */ + protected synchronized SAMReaderID getReaderID(SamReader reader) { + for(SAMReaders readers: allResources) { + SAMReaderID id = readers.getReaderID(reader); + if(id != null) + return id; + } + throw new ReviewedGATKException("No such reader id is available"); + } + + private synchronized void createNewResource() { + if(allResources.size() > maxEntries) + throw new ReviewedGATKException("Cannot create a new resource pool. All resources are in use."); + SAMReaders readers = new SAMReaders(readerIDs, validationStringency, removeProgramRecords); + allResources.add(readers); + availableResources.add(readers); + } + + } + + /** + * A collection of readers derived from a reads metadata structure. + */ + private class SAMReaders implements Iterable { + /** + * Cached representation of the merged header used to generate a merging iterator. + */ + private final SamFileHeaderMerger headerMerger; + + /** + * Internal storage for a map of id -> reader. + */ + private final Map readers = new LinkedHashMap<>(); + + /** + * The inptu streams backing + */ + private final Map inputStreams = new LinkedHashMap(); + + /** + * Derive a new set of readers from the Reads metadata. + * @param readerIDs reads to load. + * TODO: validationStringency is not used here + * @param validationStringency validation stringency. + * @param removeProgramRecords indicate whether to clear program records from the readers + */ + public SAMReaders(Collection readerIDs, ValidationStringency validationStringency, boolean removeProgramRecords) { + final int totalNumberOfFiles = readerIDs.size(); + int readerNumber = 1; + final SimpleTimer timer = new SimpleTimer().start(); + + if ( totalNumberOfFiles > 0 ) logger.info("Initializing SAMRecords in serial"); + final int tickSize = 50; + int nExecutedTotal = 0; + long lastTick = timer.currentTime(); + for(final SAMReaderID readerID: readerIDs) { + final ReaderInitializer init = new ReaderInitializer(readerID).call(); + + checkForUnsupportedBamFile(init.reader.getFileHeader()); + + if (removeProgramRecords && isIteratorSAMFileHeaderCached(readerID)) { + // Only works when the SamReader implementation caches its header. + // Some implementations (ex: CRAM) rewrite the new underlying file header in reader.getIterator(). + // Later, when MergingSamRecordIterator goes to check the headers with .contains()/.equals(), + // it will error out complaining it can't find the unmodified version of the header. + init.reader.getFileHeader().setProgramRecords(new ArrayList()); + } + + if (threadAllocation.getNumIOThreads() > 0) { + inputStreams.put(init.readerID, init.blockInputStream); // get from initializer + } + + logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, readerID.getSamFile())); + readers.put(init.readerID,init.reader); + if ( ++nExecutedTotal % tickSize == 0) { + double tickInSec = (timer.currentTime() - lastTick) / 1000.0; + printReaderPerformance(nExecutedTotal, tickSize, totalNumberOfFiles, timer, tickInSec); + lastTick = timer.currentTime(); + } + } + + if ( totalNumberOfFiles > 0 ) logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime())); + + Collection headers = new LinkedList(); + + // Examine the bam headers, perform any requested sample renaming on them, and add + // them to the list of headers to pass to the Picard SamFileHeaderMerger: + for ( final Map.Entry readerEntry : readers.entrySet() ) { + final SAMReaderID readerID = readerEntry.getKey(); + final SamReader reader = readerEntry.getValue(); + final SAMFileHeader header = reader.getFileHeader(); + + // The remappedSampleName will be null if either no on-the-fly sample renaming was requested, + // or the user's sample rename map file didn't contain an entry for this bam file: + final String remappedSampleName = sampleRenameMap != null ? sampleRenameMap.get(readerID.getSamFilePath()) : null; + + // If we've been asked to rename the sample for this bam file, do so now. We'll check to + // make sure this bam only contains reads from one sample before proceeding. + // + // IMPORTANT: relies on the fact that the Picard SamFileHeaderMerger makes a copy of + // the existing read group attributes (including sample name) when merging + // headers, regardless of whether there are read group collisions or not. + if ( remappedSampleName != null ) { + remapSampleName(readerID, header, remappedSampleName); + } + + headers.add(header); + } + + headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,headers,true); + + // update all read groups to GATKSAMRecordReadGroups + final List gatkReadGroups = new LinkedList(); + for ( final SAMReadGroupRecord rg : headerMerger.getMergedHeader().getReadGroups() ) { + gatkReadGroups.add(new GATKSAMReadGroupRecord(rg)); + } + headerMerger.getMergedHeader().setReadGroups(gatkReadGroups); + } + + /** + * Changes the sample name in the read groups for the provided bam file header to match the + * remappedSampleName. Blows up with a UserException if the header contains more than one + * sample name. + * + * @param readerID ID for the bam file from which the provided header came from + * @param header The bam file header. Will be modified by this call. + * @param remappedSampleName New sample name to replace the existing sample attribute in the + * read groups for the header. + */ + private void remapSampleName( final SAMReaderID readerID, final SAMFileHeader header, final String remappedSampleName ) { + String firstEncounteredSample = null; + + for ( final SAMReadGroupRecord readGroup : header.getReadGroups() ) { + final String thisReadGroupSample = readGroup.getSample(); + + if ( thisReadGroupSample == null ) { + throw new UserException(String.format("On-the fly sample renaming was requested for bam file %s, however this " + + "bam file contains a read group (id: %s) with a null sample attribute", + readerID.getSamFilePath(), readGroup.getId())); + } + else if ( firstEncounteredSample == null ) { + firstEncounteredSample = thisReadGroupSample; + } + else if ( ! firstEncounteredSample.equals(thisReadGroupSample) ) { + throw new UserException(String.format("On-the-fly sample renaming was requested for bam file %s, " + + "however this bam file contains reads from more than one sample " + + "(encountered samples %s and %s in the bam header). The GATK requires that " + + "all bams for which on-the-fly sample renaming is requested " + + "contain reads from only a single sample per bam.", + readerID.getSamFilePath(), firstEncounteredSample, thisReadGroupSample)); + } + + readGroup.setSample(remappedSampleName); + } + } + + final private void printReaderPerformance(final int nExecutedTotal, + final int nExecutedInTick, + final int totalNumberOfFiles, + final SimpleTimer timer, + final double tickDurationInSec) { + final int pendingSize = totalNumberOfFiles - nExecutedTotal; + final double totalTimeInSeconds = timer.getElapsedTime(); + final double nTasksPerSecond = nExecutedTotal / (1.0*totalTimeInSeconds); + final int nRemaining = pendingSize; + final double estTimeToComplete = pendingSize / nTasksPerSecond; + logger.info(String.format("Init %d BAMs in last %.2f s, %d of %d in %.2f s / %.2f m (%.2f tasks/s). %d remaining with est. completion in %.2f s / %.2f m", + nExecutedInTick, tickDurationInSec, + nExecutedTotal, totalNumberOfFiles, totalTimeInSeconds, totalTimeInSeconds / 60, nTasksPerSecond, + nRemaining, estTimeToComplete, estTimeToComplete / 60)); + } + + /** + * Return the header derived from the merging of these BAM files. + * @return the merged header. + */ + public SAMFileHeader getMergedHeader() { + return headerMerger.getMergedHeader(); + } + + /** + * Do multiple read groups collide in this dataset? + * @return True if multiple read groups collide; false otherwis. + */ + public boolean hasReadGroupCollisions() { + return headerMerger.hasReadGroupCollisions(); + } + + /** + * Get the newly mapped read group ID for the given read group. + * @param readerID Reader for which to discern the transformed ID. + * @param originalReadGroupID Original read group. + * @return Remapped read group. + */ + public String getReadGroupId(final SAMReaderID readerID, final String originalReadGroupID) { + SAMFileHeader header = readers.get(readerID).getFileHeader(); + return headerMerger.getReadGroupId(header,originalReadGroupID); + } + + /** + * Creates a new merging iterator from the given map, with the given header. + * @param iteratorMap A map of readers to iterators. + * @return An iterator which will merge those individual iterators. + */ + public MergingSamRecordIterator createMergingIterator(final Map> iteratorMap) { + return new MergingSamRecordIterator(headerMerger,iteratorMap,true); + } + + /** + * Retrieve the reader from the data structure. + * @param id The ID of the reader to retrieve. + * @return the reader associated with the given id. + */ + public SamReader getReader(SAMReaderID id) { + if(!readers.containsKey(id)) + throw new NoSuchElementException("No reader is associated with id " + id); + return readers.get(id); + } + + /** + * Retrieve the input stream backing a reader. + * @param id The ID of the reader to retrieve. + * @return the reader associated with the given id. + */ + public BlockInputStream getInputStream(final SAMReaderID id) { + return inputStreams.get(id); + } + + /** + * Searches for the reader id of this reader. + * @param reader Reader for which to search. + * @return The id associated the given reader, or null if the reader is not present in this collection. + */ + protected SAMReaderID getReaderID(SamReader reader) { + for(Map.Entry entry: readers.entrySet()) { + if(reader == entry.getValue()) + return entry.getKey(); + } + // Not found? return null. + return null; + } + + /** + * Returns an iterator over all readers in this structure. + * @return An iterator over readers. + */ + public Iterator iterator() { + return readers.values().iterator(); + } + + /** + * Returns whether any readers are present in this structure. + * @return + */ + public boolean isEmpty() { + return readers.isEmpty(); + } + } + + class ReaderInitializer implements Callable { + final SAMReaderID readerID; + BlockInputStream blockInputStream = null; + SamReader reader; + + public ReaderInitializer(final SAMReaderID readerID) { + this.readerID = readerID; + } + + public ReaderInitializer call() { + try { + if (threadAllocation.getNumIOThreads() > 0) + blockInputStream = new BlockInputStream(dispatcher,readerID,false); + reader = SamReaderFactory.makeDefault() + .referenceSequence(referenceFile) + .validationStringency(validationStringency) + .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) + .setOption(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, true) + .open(readerID.getSamFile()); + + } catch ( RuntimeIOException e ) { + throw new UserException.CouldNotReadInputFile(readerID.getSamFile(), e); + } catch ( SAMFormatException e ) { + throw new UserException.MalformedBAM(readerID.getSamFile(), e.getMessage()); + } + // Picard is throwing a RuntimeException here when BAMs are malformed with bad headers (and so look like SAM files). + // Let's keep this separate from the SAMFormatException (which ultimately derives from RuntimeException) case, + // just in case we want to change this behavior later. + catch ( RuntimeException e ) { + throw new UserException.MalformedBAM(readerID.getSamFile(), e.getMessage()); + } + return this; + } + } + + private class ReleasingIterator implements GATKSAMIterator { + /** + * The resource acting as the source of the data. + */ + private final SAMReaders resource; + + /** + * The iterator to wrap. + */ + private final GATKSAMIterator wrappedIterator; + + public ReleasingIterator(SAMReaders resource, GATKSAMIterator wrapped) { + this.resource = resource; + this.wrappedIterator = wrapped; + } + + public ReleasingIterator iterator() { + return this; + } + + public void remove() { + throw new UnsupportedOperationException("Can't remove from a GATKSAMIterator"); + } + + public void close() { + wrappedIterator.close(); + resourcePool.releaseReaders(resource); + } + + public boolean hasNext() { + return wrappedIterator.hasNext(); + } + + public SAMRecord next() { + return wrappedIterator.next(); + } + } + + /** + * Maps read groups in the original SAMFileReaders to read groups in + */ + private class ReadGroupMapping extends HashMap {} + + /** + * Locates the index file alongside the given BAM, if present. + * @param bamFile The data file to use. + * @return A File object if the index file is present; null otherwise. + */ + private File findIndexFile(File bamFile) { + return SamFiles.findIndex(bamFile); + } + + /** + * Creates a BAM schedule over all reads in the BAM file, both mapped and unmapped. The outgoing stream + * will be as granular as possible given our current knowledge of the best ways to split up BAM files. + * @return An iterator that spans all reads in all BAM files. + */ + public Iterable createShardIteratorOverAllReads(final ShardBalancer shardBalancer) { + shardBalancer.initialize(this,IntervalSharder.shardOverAllReads(this,genomeLocParser),genomeLocParser); + return shardBalancer; + } + + /** + * Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any + * read that has been assigned + * + * @param shardBalancer shard balancer object + * @return non-null initialized version of the shard balancer + */ + public Iterable createShardIteratorOverMappedReads(final ShardBalancer shardBalancer) { + shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,genomeLocParser),genomeLocParser); + return shardBalancer; + } + + /** + * Create a schedule for processing the initialized BAM file using the given interval list. + * The returned schedule should be as granular as possible. + * @param intervals The list of intervals for which to create the schedule. + * @return A granular iterator over file pointers. + */ + public Iterable createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) { + if(intervals == null) + throw new ReviewedGATKException("Unable to create schedule from intervals; no intervals were provided."); + shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals,intervalMergingRule),genomeLocParser); + return shardBalancer; + } +} + + + diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/Shard.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/Shard.java new file mode 100644 index 000000000..eb9ec480a --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/Shard.java @@ -0,0 +1,254 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.util.PeekableIterator; +import htsjdk.samtools.SAMFileSpan; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.engine.ReadMetrics; +import org.broadinstitute.gatk.engine.ReadProperties; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.HasGenomeLocation; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +/** + * + * User: aaron + * Date: Apr 10, 2009 + * Time: 5:00:27 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + +/** + * @author aaron + * @version 1.0 + * @date Apr 10, 2009 + *

    + * Interface Shard + *

    + * The base abstract class for shards. + */ +public abstract class Shard implements HasGenomeLocation { + public enum ShardType { + READ, LOCUS + } + + protected final GenomeLocParser parser; // incredibly annoying! + + /** + * What type of shard is this? Read or locus? + */ + protected final ShardType shardType; + + /** + * Locations. + */ + protected final List locs; + + /** + * Whether the current location is unmapped. + */ + private final boolean isUnmapped; + + /** + * Reads data, if applicable. + */ + private final SAMDataSource readsDataSource; + + /** + * The data backing the next chunks to deliver to the traversal engine. + */ + private final Map fileSpans; + + /** + * Lazy-calculated span of all of the genome locs in this shard + */ + private GenomeLoc spanningLocation = null; + + /** + * Statistics about which reads in this shards were used and which were filtered away. + */ + protected final ReadMetrics readMetrics = new ReadMetrics(); + + /** + * Whether this shard points to an unmapped region. + * Some shard types conceptually be unmapped (e.g. LocusShards). In + * this case, isUnmapped should always return false. + * @return True if this shard is unmapped. False otherwise. + */ + public boolean isUnmapped() { + return isUnmapped; + } + + public Shard(GenomeLocParser parser, + ShardType shardType, + List locs, + SAMDataSource readsDataSource, + Map fileSpans, + boolean isUnmapped) { + this.locs = locs; + this.parser = parser; + this.shardType = shardType; + this.readsDataSource = readsDataSource; + this.fileSpans = fileSpans; + this.isUnmapped = isUnmapped; + } + + /** + * If isUnmapped is true, than getGenomeLocs by + * definition will return a singleton list with a GenomeLoc.UNMAPPED + * + * Can return null, indicating that the entire genome is covered. + * + * @return the genome location represented by this shard + */ + public List getGenomeLocs() { + return locs; + } + + /** + * Get the list of chunks delimiting this shard. + * @return a list of chunks that contain data for this shard. + */ + public Map getFileSpans() { + return Collections.unmodifiableMap(fileSpans); + } + + /** + * Returns the span of the genomeLocs comprising this shard + * @return a GenomeLoc that starts as the first position in getGenomeLocs() and stops at the stop of the last + * position in getGenomeLocs() + */ + public GenomeLoc getLocation() { + if ( spanningLocation == null ) { + if ( getGenomeLocs() == null ) + spanningLocation = GenomeLoc.WHOLE_GENOME; + else if ( getGenomeLocs().size() == 0 ) { + spanningLocation = getGenomeLocs().get(0); + } else { + int start = Integer.MAX_VALUE; + int stop = Integer.MIN_VALUE; + String contig = null; + + for ( GenomeLoc loc : getGenomeLocs() ) { + if ( GenomeLoc.isUnmapped(loc) ) + // special case the unmapped region marker, just abort out + return loc; + contig = loc.getContig(); + if ( loc.getStart() < start ) start = loc.getStart(); + if ( loc.getStop() > stop ) stop = loc.getStop(); + } + + spanningLocation = parser.createGenomeLoc(contig, start, stop); + } + } + + return spanningLocation; + } + + + /** + * what kind of shard do we return + * @return ShardType, indicating the type + */ + public ShardType getShardType() { + return shardType; + } + + /** + * Does any releasing / aggregation required when the shard is through being processed. + */ + public void close() { + readsDataSource.incorporateReadMetrics(readMetrics); + } + + /** + * Gets key read validation and filtering properties. + * @return set of read properties associated with this shard. + */ + public ReadProperties getReadProperties() { + return readsDataSource.getReadsInfo(); + } + + /** + * Gets the runtime metrics associated with this shard. + * Retrieves a storage space of metrics about number of reads included, filtered, etc. + * @return Storage space for metrics. + */ + public ReadMetrics getReadMetrics() { + return readMetrics; + } + + /** + * Returns true if this shard is meant to buffer reads, rather + * than just holding pointers to their locations. + * @return True if this shard can buffer reads. False otherwise. + */ + public boolean buffersReads() { return false; } + + /** + * Returns true if the read buffer is currently full. + * @return True if this shard's buffer is full (and the shard can buffer reads). + */ + public boolean isBufferEmpty() { throw new UnsupportedOperationException("This shard does not buffer reads."); } + + /** + * Returns true if the read buffer is currently full. + * @return True if this shard's buffer is full (and the shard can buffer reads). + */ + public boolean isBufferFull() { throw new UnsupportedOperationException("This shard does not buffer reads."); } + + /** + * Adds a read to the read buffer. + * @param read Add a read to the internal shard buffer. + */ + public void addRead(SAMRecord read) { throw new UnsupportedOperationException("This shard does not buffer reads."); } + + /** + * Fills the shard with reads. Can only do this with shards that buffer reads + * @param readIter Iterator from which to draw the reads to fill the shard + */ + public void fill( PeekableIterator readIter ) { throw new UnsupportedOperationException("This shard does not buffer reads."); } + + /** + * Gets the iterator over the elements cached in the shard. + * @return + */ + public GATKSAMIterator iterator() { throw new UnsupportedOperationException("This shard does not buffer reads."); } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ShardBalancer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ShardBalancer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ShardBalancer.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ShardBalancer.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/package-info.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMFileStat.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMFileStat.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMFileStat.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMFileStat.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMTagRenamer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMTagRenamer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMTagRenamer.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMTagRenamer.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java new file mode 100644 index 000000000..b6869f0b9 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java @@ -0,0 +1,192 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads.utilities; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; +import org.broadinstitute.gatk.utils.commandline.Input; +import org.broadinstitute.gatk.utils.commandline.Output; +import org.broadinstitute.gatk.engine.datasources.reads.FilePointer; +import org.broadinstitute.gatk.engine.datasources.reads.IntervalSharder; +import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.interval.IntervalUtils; +import org.broadinstitute.gatk.utils.text.ListFileUtils; + +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.List; + +/** + * Traverses a region in a dataset looking for outliers. + */ +public class FindLargeShards extends CommandLineProgram { + private static Logger logger = Logger.getLogger(FindLargeShards.class); + + @Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false) + public List samFiles = new ArrayList(); + + @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) + public File referenceFile = null; + + @Input(fullName = "intervals", shortName = "L", doc = "A list of genomic intervals over which to operate. Can be explicitly specified on the command line or in a file.",required=false) + public List intervals = null; + + @Output(required=false) + public PrintStream out = System.out; + + /** + * The square of the sum of all uncompressed data. Based on the BAM spec, the size of this could be + * up to (2^64)^2. + */ + private BigInteger sumOfSquares = BigInteger.valueOf(0); + + /** + * The running sum of all uncompressed data. Based on the BAM spec, the BAM must be less than Long.MAX_LONG + * when compressed -- in other words, the sum of the sizes of all BGZF blocks must be < 2^64. + */ + private BigInteger sum = BigInteger.valueOf(0); + + /** + * The number of shards viewed. + */ + private long numberOfShards; + + + @Override + public int execute() throws IOException { + // initialize reference + IndexedFastaSequenceFile refReader = new IndexedFastaSequenceFile(referenceFile); + GenomeLocParser genomeLocParser = new GenomeLocParser(refReader); + + // initialize reads + List bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser); + SAMDataSource dataSource = new SAMDataSource(referenceFile, bamReaders, new ThreadAllocation(), null, genomeLocParser); + + // intervals + final GenomeLocSortedSet intervalSortedSet; + if ( intervals != null ) + intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals), IntervalMergingRule.ALL); + else + intervalSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(refReader.getSequenceDictionary()); + + logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize")); + + IntervalSharder sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet,IntervalMergingRule.ALL); + while(sharder.hasNext()) { + FilePointer filePointer = sharder.next(); + + // Size of the file pointer. + final long size = filePointer.size(); + + BigInteger bigSize = BigInteger.valueOf(size); + sumOfSquares = sumOfSquares.add(bigSize.pow(2)); + sum = sum.add(bigSize); + numberOfShards++; + + if(numberOfShards % 1000 == 0) { + GenomeLoc boundingRegion = getBoundingRegion(filePointer,genomeLocParser); + logger.info(String.format("PROGRESS: Calculating mean and variance: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size)); + } + + } + + // Print out the stddev: (sum(x^2) - (1/N)*sum(x)^2)/N + long mean = sum.divide(BigInteger.valueOf(numberOfShards)).longValue(); + long stddev = (long)(Math.sqrt(sumOfSquares.subtract(sum.pow(2).divide(BigInteger.valueOf(numberOfShards))).divide(BigInteger.valueOf(numberOfShards)).doubleValue())); + logger.info(String.format("Number of shards: %d; mean uncompressed size = %d; stddev uncompressed size = %d%n",numberOfShards,mean,stddev)); + + // Crank through the shards again, this time reporting on the shards significantly larger than the mean. + long threshold = mean + stddev*5; + logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize")); + out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n"); + + sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet,IntervalMergingRule.ALL); + while(sharder.hasNext()) { + FilePointer filePointer = sharder.next(); + + // Bounding region. + GenomeLoc boundingRegion = getBoundingRegion(filePointer,genomeLocParser); + + // Size of the file pointer. + final long size = filePointer.size(); + + numberOfShards++; + + if(filePointer.size() <= threshold) { + if(numberOfShards % 1000 == 0) + logger.info(String.format("PROGRESS: Searching for large shards: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size)); + continue; + } + + out.printf("%s\t%d\t%d\t%d%n",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size); + } + + return 0; + } + + private GenomeLoc getBoundingRegion(final FilePointer filePointer, final GenomeLocParser genomeLocParser) { + List regions = filePointer.getLocations(); + + // The region contained by this FilePointer. + final String contig = regions.get(0).getContig(); + final int start = regions.get(0).getStart(); + final int stop = regions.get(regions.size()-1).getStop(); + + return genomeLocParser.createGenomeLoc(contig,start,stop); + } + + /** + * Required main method implementation. + * @param argv Command-line argument text. + * @throws Exception on error. + */ + public static void main(String[] argv) throws Exception { + int returnCode = 0; + try { + FindLargeShards instance = new FindLargeShards(); + start(instance, argv); + returnCode = 0; + } + catch(Exception ex) { + returnCode = 1; + ex.printStackTrace(); + throw ex; + } + finally { + System.exit(returnCode); + } + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBAMRegion.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBAMRegion.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBAMRegion.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBAMRegion.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBGZFBounds.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBGZFBounds.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBGZFBounds.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBGZFBounds.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/UnzipSingleBlock.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/UnzipSingleBlock.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/UnzipSingleBlock.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/UnzipSingleBlock.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/package-info.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java new file mode 100644 index 000000000..6b7bf2187 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java @@ -0,0 +1,166 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reference; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.SAMSequenceRecord; +import org.broadinstitute.gatk.engine.datasources.reads.LocusShard; +import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; +import org.broadinstitute.gatk.engine.datasources.reads.Shard; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; + +import java.io.File; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * Loads reference data from fasta file + * Looks for fai and dict files, and tries to create them if they don't exist + */ +public class ReferenceDataSource { + private IndexedFastaSequenceFile reference; + + /** our log, which we want to capture anything from this class */ + protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class); + + /** + * Create reference data source from fasta file + * @param fastaFile Fasta file to be used as reference + */ + public ReferenceDataSource(File fastaFile) { + reference = CachingIndexedFastaSequenceFile.checkAndCreate(fastaFile); + } + + /** + * Get indexed fasta file + * @return IndexedFastaSequenceFile that was created from file + */ + public IndexedFastaSequenceFile getReference() { + return this.reference; + } + + /** + * Creates an iterator for processing the entire reference. + * @param readsDataSource the reads datasource to embed in the locus shard. + * @param parser used to generate/regenerate intervals. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. + * @param maxShardSize The maximum shard size which can be used to create this list. + * @return Creates a schedule for performing a traversal over the entire reference. + */ + public Iterable createShardsOverEntireReference(final SAMDataSource readsDataSource, final GenomeLocParser parser, final int maxShardSize) { + List shards = new ArrayList(); + for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) { + for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) { + final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength()); + shards.add(new LocusShard(parser, + readsDataSource, + Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)), + null)); + } + } + return shards; + } + + + public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) { + List shards = new ArrayList(); + + for(GenomeLoc interval: intervals) { + while(interval.size() > maxShardSize) { + shards.add(new LocusShard(intervals.getGenomeLocParser(), + readsDataSource, + Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)), + null)); + interval = intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop()); + } + shards.add(new LocusShard(intervals.getGenomeLocParser(), + readsDataSource, + Collections.singletonList(interval), + null)); + } + + return shards; + } + + + /** + * Creates an iterator for processing the entire reference. + * @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. + * @param intervals the list of intervals to use when processing the reference. + * @param targetShardSize the suggested - and maximum - shard size which can be used to create this list; we will merge intervals greedily so that we generate shards up to but not greater than the target size. + * @return Creates a schedule for performing a traversal over the entire reference. + */ +/* + public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int targetShardSize) { + final List shards = new ArrayList(); + final GenomeLocParser parser = intervals.getGenomeLocParser(); + LinkedList currentIntervals = new LinkedList(); + + for(GenomeLoc interval: intervals) { + // if the next interval is too big, we can safely shard currentInterval and then break down this one + if (interval.size() > targetShardSize) { + if (!currentIntervals.isEmpty()) + shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); + while(interval.size() > targetShardSize) { + final GenomeLoc partialInterval = parser.createGenomeLoc(interval.getContig(), interval.getStart(), interval.getStart()+targetShardSize-1); + shards.add(createShardFromInterval(Collections.singletonList(partialInterval), readsDataSource, parser)); + interval = parser.createGenomeLoc(interval.getContig(), interval.getStart() + targetShardSize, interval.getStop()); + } + currentIntervals = new LinkedList(); + currentIntervals.add(interval); + } + // otherwise, we need to check whether we can merge this interval with currentInterval (and either shard currentInterval or merge accordingly) + else { + if (currentIntervals.isEmpty()) { + currentIntervals.add(interval); + } + else { + if (currentIntervals.getLast().compareContigs(interval) != 0 || interval.getStop() - currentIntervals.getLast().getStart() + 1 > targetShardSize) { + shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); + currentIntervals = new LinkedList(); + } + currentIntervals.add(interval); + } + } + } + if (!currentIntervals.isEmpty()) + shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); + return shards; + } + + private static Shard createShardFromInterval(final List intervals, final SAMDataSource readsDataSource, final GenomeLocParser parser) { + //logger.debug("Adding shard " + interval); + return new LocusShard(parser, + readsDataSource, + intervals, + null); + } +*/ +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/package-info.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/DataStreamSegment.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/DataStreamSegment.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/DataStreamSegment.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/DataStreamSegment.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/EntireStream.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/EntireStream.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/EntireStream.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/EntireStream.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/MappedStreamSegment.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/MappedStreamSegment.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/MappedStreamSegment.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/MappedStreamSegment.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPool.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPool.java new file mode 100644 index 000000000..6920ba242 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPool.java @@ -0,0 +1,153 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.rmd; + +import htsjdk.samtools.SAMSequenceDictionary; +import org.broadinstitute.gatk.utils.refdata.SeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrack; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrackBuilder; +import org.broadinstitute.gatk.utils.refdata.utils.FlashBackIterator; +import org.broadinstitute.gatk.utils.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.util.List; + +/** + * A pool of reference-ordered data iterators. + */ +class ReferenceOrderedDataPool extends ResourcePool { + // the reference-ordered data itself. + private final RMDTriplet fileDescriptor; + + // our tribble track builder + private final RMDTrackBuilder builder; + + /** + * The header from this RMD, if present. + */ + private final Object header; + + /** + * The sequence dictionary from this ROD. If no sequence dictionary is present, this dictionary will be the same as the reference's. + */ + private final SAMSequenceDictionary sequenceDictionary; + + boolean flashbackData = false; + public ReferenceOrderedDataPool(RMDTriplet fileDescriptor,RMDTrackBuilder builder,SAMSequenceDictionary sequenceDictionary, GenomeLocParser genomeLocParser,boolean flashbackData) { + super(sequenceDictionary,genomeLocParser); + this.fileDescriptor = fileDescriptor; + this.builder = builder; + this.flashbackData = flashbackData; + + // prepopulate one RMDTrack + LocationAwareSeekableRODIterator iterator = createNewResource(); + this.addNewResource(iterator); + + // Pull the proper header and sequence dictionary from the prepopulated track. + this.header = iterator.getHeader(); + this.sequenceDictionary = iterator.getSequenceDictionary(); + } + + /** + * Gets the header used by this resource pool. + * @return Header used by this resource pool. + */ + public Object getHeader() { + return header; + } + + /** + * Gets the sequence dictionary built into the ROD index file. + * @return Sequence dictionary from the index file. + */ + public SAMSequenceDictionary getSequenceDictionary() { + return sequenceDictionary; + } + + /** + * Create a new iterator from the existing reference-ordered data. This new iterator is expected + * to be completely independent of any other iterator. + * @return The newly created resource. + */ + public LocationAwareSeekableRODIterator createNewResource() { + if(numIterators() > 0) + throw new ReviewedGATKException("BUG: Tried to create multiple iterators over streaming ROD interface"); + RMDTrack track = builder.createInstanceOfTrack(fileDescriptor); + LocationAwareSeekableRODIterator iter = new SeekableRODIterator(track.getHeader(),track.getSequenceDictionary(),referenceSequenceDictionary,genomeLocParser,track.getIterator()); + return (flashbackData) ? new FlashBackIterator(iter) : iter; + } + + /** + * Finds the best existing ROD iterator from the pool. In this case, the best existing ROD is defined as + * the first one encountered that is at or before the given position. + * @param segment @{inheritedDoc} + * @param resources @{inheritedDoc} + * @return @{inheritedDoc} + */ + public LocationAwareSeekableRODIterator selectBestExistingResource( DataStreamSegment segment, List resources ) { + if(segment instanceof MappedStreamSegment) { + GenomeLoc position = ((MappedStreamSegment)segment).getLocation(); + + for( LocationAwareSeekableRODIterator RODIterator : resources ) { + + if( (RODIterator.position() == null && RODIterator.hasNext()) || + (RODIterator.position() != null && RODIterator.position().isBefore(position)) ) + return RODIterator; + if (RODIterator.position() != null && RODIterator instanceof FlashBackIterator && ((FlashBackIterator)RODIterator).canFlashBackTo(position)) { + ((FlashBackIterator)RODIterator).flashBackTo(position); + return RODIterator; + } + + } + return null; + } + else if(segment instanceof EntireStream) { + // Asking for a segment over the entire stream, so by definition, there is no best existing resource. + // Force the system to create a new one. + return null; + } + else { + throw new ReviewedGATKException("Unable to find a ROD iterator for segments of type " + segment.getClass()); + } + } + + /** + * In this case, the iterator is the resource. Pass it through. + */ + public LocationAwareSeekableRODIterator createIteratorFromResource( DataStreamSegment segment, LocationAwareSeekableRODIterator resource ) { + return resource; + } + + /** + * kill the buffers in the iterator + */ + public void closeResource( LocationAwareSeekableRODIterator resource ) { + if (resource instanceof FlashBackIterator) ((FlashBackIterator)resource).close(); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataSource.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataSource.java new file mode 100644 index 000000000..e90cb8047 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataSource.java @@ -0,0 +1,257 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.rmd; + +import htsjdk.samtools.SAMSequenceDictionary; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.refdata.SeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrack; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrackBuilder; +import org.broadinstitute.gatk.utils.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.lang.reflect.Type; +import java.util.List; + +/** + * A data source which provides a single type of reference-ordered data. + */ +public class ReferenceOrderedDataSource { + /** + * The reference-ordered data itself. + */ + private final RMDTriplet fileDescriptor; + + /** + * The header associated with this VCF, if any. + */ + private final Object header; + + /** + * The private sequence dictionary associated with this RMD. + */ + private final SAMSequenceDictionary sequenceDictionary; + + /** + * The builder to use when constructing new reference-ordered data readers. + */ + private final RMDTrackBuilder builder; + + /** + * A pool of iterators for navigating through the genome. + */ + private final ResourcePool iteratorPool; + + /** + * Create a new reference-ordered data source. + */ + public ReferenceOrderedDataSource(RMDTriplet fileDescriptor, + RMDTrackBuilder builder, + SAMSequenceDictionary referenceSequenceDictionary, + GenomeLocParser genomeLocParser, + boolean flashbackData ) { + this.fileDescriptor = fileDescriptor; + this.builder = builder; + + // TODO: Unify the two blocks of code below by creating a ReferenceOrderedDataPool base class of a coherent type (not RMDTrack for one and SeekableIterator for the other). + if (fileDescriptor.getStorageType() != RMDTriplet.RMDStorageType.STREAM) { + iteratorPool = new ReferenceOrderedQueryDataPool(fileDescriptor, + builder, + referenceSequenceDictionary, + genomeLocParser); + this.header = ((ReferenceOrderedQueryDataPool)iteratorPool).getHeader(); + this.sequenceDictionary = ((ReferenceOrderedQueryDataPool)iteratorPool).getSequenceDictionary(); + } + else { + iteratorPool = new ReferenceOrderedDataPool(fileDescriptor, + builder, + referenceSequenceDictionary, + genomeLocParser, + flashbackData); + this.header = ((ReferenceOrderedDataPool)iteratorPool).getHeader(); + this.sequenceDictionary = ((ReferenceOrderedDataPool)iteratorPool).getSequenceDictionary(); + } + } + + /** + * Return the name of the underlying reference-ordered data. + * @return Name of the underlying rod. + */ + public String getName() { + return fileDescriptor.getName(); + } + + public Class getType() { + return builder.getFeatureManager().getByTriplet(fileDescriptor).getCodecClass(); + } + + public Class getRecordType() { + return builder.getFeatureManager().getByTriplet(fileDescriptor).getFeatureClass(); + } + + public File getFile() { + return new File(fileDescriptor.getFile()); + } + + public Object getHeader() { + return header; + } + + public Tags getTags() { + return fileDescriptor.getTags(); + } + + public String getTagValue( final String key ) { + return fileDescriptor.getTags().getValue( key ); + } + + + /** + * Retrieves the sequence dictionary created by this ROD. + * @return + */ + public SAMSequenceDictionary getSequenceDictionary() { + return sequenceDictionary; + } + + /** + * helper function for determining if we are the same track based on name and record type + * + * @param name the name to match + * @param type the type to match + * + * @return true on a match, false if the name or type is different + */ + public boolean matchesNameAndRecordType(String name, Type type) { + return (name.equals(fileDescriptor.getName()) && (type.getClass().isAssignableFrom(getType().getClass()))); + } + + /** + * Seek to the specified position and return an iterator through the data. + * + * @param loc GenomeLoc that points to the selected position. + * + * @return Iterator through the data. + */ + public LocationAwareSeekableRODIterator seek(GenomeLoc loc) { + DataStreamSegment dataStreamSegment = loc != null ? new MappedStreamSegment(loc) : new EntireStream(); + return iteratorPool.iterator(dataStreamSegment); + } + + + /** + * Close the specified iterator, returning it to the pool. + * @param iterator Iterator to close. + */ + public void close( LocationAwareSeekableRODIterator iterator ) { + iteratorPool.release(iterator); + } + +} + +/** + * a data pool for the new query based RODs + */ +class ReferenceOrderedQueryDataPool extends ResourcePool { + // the reference-ordered data itself. + private final RMDTriplet fileDescriptor; + + // our tribble track builder + private final RMDTrackBuilder builder; + + /** + * The header from this RMD, if present. + */ + private final Object header; + + /** + * The sequence dictionary from this ROD. If no sequence dictionary is present, this dictionary will be the same as the reference's. + */ + private final SAMSequenceDictionary sequenceDictionary; + + public ReferenceOrderedQueryDataPool(RMDTriplet fileDescriptor, RMDTrackBuilder builder, SAMSequenceDictionary referenceSequenceDictionary, GenomeLocParser genomeLocParser) { + super(referenceSequenceDictionary,genomeLocParser); + this.fileDescriptor = fileDescriptor; + this.builder = builder; + + // prepopulate one RMDTrack + RMDTrack track = builder.createInstanceOfTrack(fileDescriptor); + this.addNewResource(track); + + // Pull the proper header and sequence dictionary from the prepopulated track. + this.header = track.getHeader(); + this.sequenceDictionary = track.getSequenceDictionary(); + } + + public Object getHeader() { + return header; + } + + public SAMSequenceDictionary getSequenceDictionary() { + return sequenceDictionary; + } + + @Override + protected RMDTrack createNewResource() { + return builder.createInstanceOfTrack(fileDescriptor); + } + + @Override + protected RMDTrack selectBestExistingResource(DataStreamSegment segment, List availableResources) { + for (RMDTrack reader : availableResources) + if (reader != null) return reader; + return null; + } + + @Override + protected LocationAwareSeekableRODIterator createIteratorFromResource(DataStreamSegment position, RMDTrack track) { + try { + if (position instanceof MappedStreamSegment) { + GenomeLoc pos = ((MappedStreamSegment) position).locus; + return new SeekableRODIterator(header,sequenceDictionary,referenceSequenceDictionary,genomeLocParser,track.query(pos)); + } else { + return new SeekableRODIterator(header,sequenceDictionary,referenceSequenceDictionary,genomeLocParser,track.getIterator()); + } + } catch (FileNotFoundException e) { + throw new UserException.CouldNotReadInputFile(fileDescriptor.getName(), "it could not be found"); + } catch (IOException | RuntimeException e) { + throw new ReviewedGATKException("Unable to create iterator for rod named " + fileDescriptor.getName(),e); + } + + } + + @Override + protected void closeResource(RMDTrack track) { + track.close(); + } +} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ResourcePool.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ResourcePool.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ResourcePool.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ResourcePool.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/package-info.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/Accumulator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/Accumulator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/Accumulator.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/Accumulator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroScheduler.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroScheduler.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroScheduler.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroScheduler.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroSchedulerMBean.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroSchedulerMBean.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroSchedulerMBean.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroSchedulerMBean.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java new file mode 100644 index 000000000..fc68b9c7a --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java @@ -0,0 +1,130 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.executive; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.datasources.providers.LocusShardDataProvider; +import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.gatk.engine.datasources.providers.ShardDataProvider; +import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; +import org.broadinstitute.gatk.engine.datasources.reads.Shard; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.engine.io.DirectOutputTracker; +import org.broadinstitute.gatk.engine.io.OutputTracker; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.engine.traversals.TraversalEngine; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor; + +import java.util.Collection; + + +/** A micro-scheduling manager for single-threaded execution of a traversal. */ +public class LinearMicroScheduler extends MicroScheduler { + + /** + * A direct output tracker for directly managing output. + */ + private DirectOutputTracker outputTracker = new DirectOutputTracker(); + + /** + * Create a new linear microscheduler to process the given reads and reference. + * + * @param walker Walker for the traversal. + * @param reads Reads file(s) to process. + * @param reference Reference for driving the traversal. + * @param rods Reference-ordered data. + */ + protected LinearMicroScheduler(final GenomeAnalysisEngine engine, + final Walker walker, + final SAMDataSource reads, + final IndexedFastaSequenceFile reference, + final Collection rods, + final ThreadAllocation threadAllocation) { + super(engine, walker, reads, reference, rods, threadAllocation); + + if ( threadAllocation.monitorThreadEfficiency() ) + setThreadEfficiencyMonitor(new ThreadEfficiencyMonitor()); + } + + /** + * Run this traversal over the specified subsection of the dataset. + * + * @param walker Computation to perform over dataset. + * @param shardStrategy A strategy for sharding the data. + */ + public Object execute(Walker walker, Iterable shardStrategy) { + super.startingExecution(); + walker.initialize(); + Accumulator accumulator = Accumulator.create(engine,walker); + + boolean done = walker.isDone(); + int counter = 0; + + final TraversalEngine traversalEngine = borrowTraversalEngine(this); + for (Shard shard : shardStrategy ) { + if ( abortExecution() || done || shard == null ) // we ran out of shards that aren't owned + break; + + if(shard.getShardType() == Shard.ShardType.LOCUS) { + WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), + getReadIterator(shard), shard.getGenomeLocs(), ReadUtils.getSAMFileSamples(engine.getSAMFileHeader())); + for(WindowMaker.WindowMakerIterator iterator: windowMaker) { + ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),engine.getGenomeLocParser(),iterator.getLocus(),iterator,reference,rods); + Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); + accumulator.accumulate(dataProvider,result); + dataProvider.close(); + if ( walker.isDone() ) break; + } + windowMaker.close(); + } + else { + ShardDataProvider dataProvider = new ReadShardDataProvider(shard,engine.getGenomeLocParser(),getReadIterator(shard),reference,rods); + Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); + accumulator.accumulate(dataProvider,result); + dataProvider.close(); + } + + done = walker.isDone(); + } + + Object result = accumulator.finishTraversal(); + + outputTracker.close(); + returnTraversalEngine(this, traversalEngine); + cleanup(); + executionIsDone(); + + return accumulator; + } + + /** + * @{inheritDoc} + */ + public OutputTracker getOutputTracker() { return outputTracker; } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java new file mode 100644 index 000000000..f9660a94a --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java @@ -0,0 +1,463 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.executive; + +import com.google.java.contract.Ensures; +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.ReadMetrics; +import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; +import org.broadinstitute.gatk.engine.datasources.reads.Shard; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.engine.io.OutputTracker; +import org.broadinstitute.gatk.engine.iterators.NullSAMIterator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.engine.traversals.*; +import org.broadinstitute.gatk.engine.walkers.*; +import org.broadinstitute.gatk.utils.AutoFormattingTime; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.progressmeter.ProgressMeter; +import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor; + +import javax.management.JMException; +import javax.management.MBeanServer; +import javax.management.ObjectName; +import java.io.File; +import java.lang.management.ManagementFactory; +import java.util.*; + + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Apr 26, 2009 + * Time: 12:37:23 PM + * + * General base class for all scheduling algorithms + * Shards and schedules data in manageable chunks. + * + * Creates N TraversalEngines for each data thread for the MicroScheduler. This is necessary + * because in the HMS case you have multiple threads executing a traversal engine independently, and + * these engines may need to create separate resources for efficiency or implementation reasons. For example, + * the nanoScheduler creates threads to implement the traversal, and this creation is instance specific. + * So each HMS thread needs to have it's own distinct copy of the traversal engine if it wants to have + * N data threads x M nano threads => N * M threads total. These are borrowed from this microscheduler + * and returned when done. Also allows us to tracks all created traversal engines so this microscheduler + * can properly shut them all down when the scheduling is done. + * + */ +public abstract class MicroScheduler implements MicroSchedulerMBean { + protected static final Logger logger = Logger.getLogger(MicroScheduler.class); + + /** + * The list of all Traversal engines we've created in this micro scheduler + */ + final List allCreatedTraversalEngines = new LinkedList(); + + /** + * All available engines. Engines are borrowed and returned when a subclass is actually + * going to execute the engine on some data. This allows us to have N copies for + * N data parallel executions, but without the dangerous code of having local + * ThreadLocal variables. + */ + final LinkedList availableTraversalEngines = new LinkedList(); + + /** + * Engines that have been allocated to a key already. + */ + final HashMap allocatedTraversalEngines = new HashMap(); + + /** + * Counts the number of instances of the class that are currently alive. + */ + private static int instanceNumber = 0; + + /** + * The engine invoking this scheduler. + */ + protected final GenomeAnalysisEngine engine; + + protected final IndexedFastaSequenceFile reference; + + private final SAMDataSource reads; + protected final Collection rods; + + private final MBeanServer mBeanServer; + private final ObjectName mBeanName; + + /** + * Threading efficiency monitor for tracking the resource utilization of the GATK + * + * may be null + */ + ThreadEfficiencyMonitor threadEfficiencyMonitor = null; + + /** + * MicroScheduler factory function. Create a microscheduler appropriate for reducing the + * selected walker. + * + * @param walker Which walker to use. + * @param reads the informations associated with the reads + * @param reference the reference file + * @param rods the rods to include in the traversal + * @param threadAllocation Number of threads to utilize. + * + * @return The best-fit microscheduler. + */ + public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { + if ( threadAllocation.isRunningInParallelMode() ) { + logger.info(String.format("Running the GATK in parallel mode with %d total threads, " + + "%d CPU thread(s) for each of %d data thread(s), of %d processors available on this machine", + threadAllocation.getTotalNumThreads(), + threadAllocation.getNumCPUThreadsPerDataThread(), + threadAllocation.getNumDataThreads(), + Runtime.getRuntime().availableProcessors())); + if ( threadAllocation.getTotalNumThreads() > Runtime.getRuntime().availableProcessors() ) + logger.warn(String.format("Number of requested GATK threads %d is more than the number of " + + "available processors on this machine %d", threadAllocation.getTotalNumThreads(), + Runtime.getRuntime().availableProcessors())); + } + + if ( threadAllocation.getNumDataThreads() > 1 ) { + if (walker.isReduceByInterval()) + throw new UserException.BadArgumentValue("nt", String.format("This run of %s is set up to aggregate results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option or check if this tool has an option to disable per-interval calculations.", engine.getWalkerName(walker.getClass()))); + + if ( ! (walker instanceof TreeReducible) ) { + throw badNT("nt", engine, walker); + } + } + + if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof NanoSchedulable) ) { + throw badNT("nct", engine, walker); + } + + if ( threadAllocation.getNumDataThreads() > 1 ) { + return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); + } else { + return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); + } + } + + private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) { + throw new UserException.BadArgumentValue(parallelArg, + String.format("The analysis %s currently does not support parallel execution with %s. " + + "Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg)); + } + + /** + * Create a microscheduler given the reads and reference. + * + * @param walker the walker to execute with + * @param reads The reads. + * @param reference The reference. + * @param rods the rods to include in the traversal + * @param threadAllocation the allocation of threads to use in the underlying traversal + */ + protected MicroScheduler(final GenomeAnalysisEngine engine, + final Walker walker, + final SAMDataSource reads, + final IndexedFastaSequenceFile reference, + final Collection rods, + final ThreadAllocation threadAllocation) { + this.engine = engine; + this.reads = reads; + this.reference = reference; + this.rods = rods; + + final File progressLogFile = engine.getArguments() == null ? null : engine.getArguments().performanceLog; + + // Creates uninitialized TraversalEngines appropriate for walker and threadAllocation, + // and adds it to the list of created engines for later shutdown. + for ( int i = 0; i < threadAllocation.getNumDataThreads(); i++ ) { + final TraversalEngine traversalEngine = createTraversalEngine(walker, threadAllocation); + allCreatedTraversalEngines.add(traversalEngine); + availableTraversalEngines.add(traversalEngine); + } + + // Create the progress meter, and register it with the analysis engine + engine.registerProgressMeter(new ProgressMeter(progressLogFile, + availableTraversalEngines.peek().getTraversalUnits(), + engine.getRegionsOfGenomeBeingProcessed())); + + // Now that we have a progress meter, go through and initialize the traversal engines + for ( final TraversalEngine traversalEngine : allCreatedTraversalEngines ) + traversalEngine.initialize(engine, walker, engine.getProgressMeter()); + + // JMX does not allow multiple instances with the same ObjectName to be registered with the same platform MXBean. + // To get around this limitation and since we have no job identifier at this point, register a simple counter that + // will count the number of instances of this object that have been created in this JVM. + int thisInstance = instanceNumber++; + mBeanServer = ManagementFactory.getPlatformMBeanServer(); + try { + mBeanName = new ObjectName("org.broadinstitute.gatk.engine.executive:type=MicroScheduler,instanceNumber="+thisInstance); + mBeanServer.registerMBean(this, mBeanName); + } + catch (JMException ex) { + throw new ReviewedGATKException("Unable to register microscheduler with JMX", ex); + } + } + + /** + * Really make us a traversal engine of the appropriate type for walker and thread allocation + * + * @return a non-null uninitialized traversal engine + */ + @Ensures("result != null") + private TraversalEngine createTraversalEngine(final Walker walker, final ThreadAllocation threadAllocation) { + if (walker instanceof ReadWalker) { + return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()); + } else if (walker instanceof LocusWalker) { + return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()); + } else if (walker instanceof DuplicateWalker) { + return new TraverseDuplicates(); + } else if (walker instanceof ReadPairWalker) { + return new TraverseReadPairs(); + } else if (walker instanceof ActiveRegionWalker) { + return new TraverseActiveRegions(threadAllocation.getNumCPUThreadsPerDataThread()); + } else { + throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); + } + } + + + /** + * Return the ThreadEfficiencyMonitor we are using to track our resource utilization, if there is one + * + * @return the monitor, or null if none is active + */ + public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { + return threadEfficiencyMonitor; + } + + /** + * Inform this Microscheduler to use the efficiency monitor used to create threads in subclasses + * + * @param threadEfficiencyMonitor + */ + public void setThreadEfficiencyMonitor(final ThreadEfficiencyMonitor threadEfficiencyMonitor) { + this.threadEfficiencyMonitor = threadEfficiencyMonitor; + } + + /** + * Should we stop all execution work and exit gracefully? + * + * Returns true in the case where some external signal or time limit has been received, indicating + * that this GATK shouldn't continue executing. This isn't a kill signal, it is really a "shutdown + * gracefully at the next opportunity" signal. Concrete implementations of the MicroScheduler + * examine this value as often as reasonable and, if it returns true, stop what they are doing + * at the next available opportunity, shutdown their resources, call notify done, and return. + * + * @return true if we should abort execution, or false otherwise + */ + protected boolean abortExecution() { + final boolean abort = engine.exceedsRuntimeLimit(); + if ( abort ) { + final AutoFormattingTime aft = new AutoFormattingTime(engine.getRuntimeLimitInNanoseconds(), -1, 4); + logger.info("Aborting execution (cleanly) because the runtime has exceeded the requested maximum " + aft); + } + return abort; + } + + /** + * Walks a walker over the given list of intervals. + * + * @param walker Computation to perform over dataset. + * @param shardStrategy A strategy for sharding the data. + * + * @return the return type of the walker + */ + public abstract Object execute(Walker walker, Iterable shardStrategy); + + /** + * Tells this MicroScheduler that the execution of one of the subclass of this object as started + * + * Must be called when the implementation of execute actually starts up + * + * Currently only starts the progress meter timer running, but other start up activities could be incorporated + */ + protected void startingExecution() { + engine.getProgressMeter().start(); + } + + /** + * Retrieves the object responsible for tracking and managing output. + * @return An output tracker, for loading data in and extracting results. Will not be null. + */ + public abstract OutputTracker getOutputTracker(); + + /** + * Gets the an iterator over the given reads, which will iterate over the reads in the given shard. + * @param shard the shard to use when querying reads. + * @return an iterator over the reads specified in the shard. + */ + protected GATKSAMIterator getReadIterator(Shard shard) { + return (!reads.isEmpty()) ? reads.seek(shard) : new NullSAMIterator(); + } + + /** + * Must be called by subclasses when execute is done + */ + protected void executionIsDone() { + engine.getProgressMeter().notifyDone(engine.getCumulativeMetrics().getNumIterations()); + printReadFilteringStats(); + shutdownTraversalEngines(); + + // Print out the threading efficiency of this HMS, if state monitoring is enabled + if ( threadEfficiencyMonitor != null ) { + // include the master thread information + threadEfficiencyMonitor.threadIsDone(Thread.currentThread()); + threadEfficiencyMonitor.printUsageInformation(logger); + } + } + + /** + * Shutdown all of the created engines, and clear the list of created engines, dropping + * pointers to the traversal engines + */ + public synchronized void shutdownTraversalEngines() { + for ( final TraversalEngine te : allCreatedTraversalEngines) + te.shutdown(); + + allCreatedTraversalEngines.clear(); + availableTraversalEngines.clear(); + } + + /** + * Prints out information about number of reads observed and filtering, if any reads were used in the traversal + * + * Looks like: + * + * INFO 10:40:47,370 MicroScheduler - 22 reads were filtered out during traversal out of 101 total (21.78%) + * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing BadMateFilter + * INFO 10:40:47,370 MicroScheduler - -> 20 reads (19.80% of total) failing DuplicateReadFilter + * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing FailsVendorQualityCheckFilter + */ + private void printReadFilteringStats() { + final ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics(); + if ( cumulativeMetrics.getNumReadsSeen() > 0 ) { + // count up the number of skipped reads by summing over all filters + long nSkippedReads = 0L; + for ( final long countsByFilter : cumulativeMetrics.getCountsByFilter().values()) + nSkippedReads += countsByFilter; + + logger.info(String.format("%d reads were filtered out during the traversal out of approximately %d total reads (%.2f%%)", + nSkippedReads, + cumulativeMetrics.getNumReadsSeen(), + 100.0 * MathUtils.ratio(nSkippedReads, cumulativeMetrics.getNumReadsSeen()))); + + for ( final Map.Entry filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) { + long count = filterCounts.getValue(); + logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s", + count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), filterCounts.getKey())); + } + } + } + + /** + * Gets the engine that created this microscheduler. + * @return The engine owning this microscheduler. + */ + public GenomeAnalysisEngine getEngine() { return engine; } + + /** + * Returns data source maintained by this scheduler + * @return + */ + public SAMDataSource getSAMDataSource() { return reads; } + + /** + * Returns the reference maintained by this scheduler. + * @return The reference maintained by this scheduler. + */ + public IndexedFastaSequenceFile getReference() { return reference; } + + protected void cleanup() { + try { + mBeanServer.unregisterMBean(mBeanName); + } + catch (JMException ex) { + throw new ReviewedGATKException("Unable to unregister microscheduler with JMX", ex); + } + } + + /** + * Returns a traversal engine suitable for use, associated with key + * + * Key is an arbitrary object that is used to retrieve the same traversal + * engine over and over. This can be important in the case where the + * traversal engine has data associated with it in some other context, + * and we need to ensure that the context always sees the same traversal + * engine. This happens in the HierarchicalMicroScheduler, where you want + * the a thread executing traversals to retrieve the same engine each time, + * as outputs are tracked w.r.t. that engine. + * + * If no engine is associated with key yet, pops the next available engine + * from the available ones maintained by this + * microscheduler. Note that it's a runtime error to pop a traversal engine + * from this scheduler if there are none available. Callers that + * once pop'd an engine for use must return it with returnTraversalEngine + * + * @param key the key to associate with this engine + * @return a non-null TraversalEngine suitable for execution in this scheduler + */ + @Ensures("result != null") + protected synchronized TraversalEngine borrowTraversalEngine(final Object key) { + if ( key == null ) throw new IllegalArgumentException("key cannot be null"); + + final TraversalEngine engine = allocatedTraversalEngines.get(key); + if ( engine == null ) { + if ( availableTraversalEngines.isEmpty() ) + throw new IllegalStateException("no traversal engines were available"); + allocatedTraversalEngines.put(key, availableTraversalEngines.pop()); + return allocatedTraversalEngines.get(key); + } else { + return engine; + } + } + + /** + * Return a borrowed traversal engine to this MicroScheduler, for later use + * in another traversal execution + * + * @param key the key used to id the engine, provided to the borrowTraversalEngine function + * @param traversalEngine the borrowed traversal engine. Must have been previously borrowed. + */ + protected synchronized void returnTraversalEngine(final Object key, final TraversalEngine traversalEngine) { + if ( traversalEngine == null ) + throw new IllegalArgumentException("Attempting to push a null traversal engine"); + if ( ! allCreatedTraversalEngines.contains(traversalEngine) ) + throw new IllegalArgumentException("Attempting to push a traversal engine not created by this MicroScheduler" + engine); + if ( ! allocatedTraversalEngines.containsKey(key) ) + throw new IllegalArgumentException("No traversal engine was never checked out with key " + key); + + // note there's nothing to actually do here, but a function implementation + // might want to do something + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/MicroSchedulerMBean.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroSchedulerMBean.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/MicroSchedulerMBean.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroSchedulerMBean.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/OutputMergeTask.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/OutputMergeTask.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/OutputMergeTask.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/OutputMergeTask.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/ReduceTree.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/ReduceTree.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/ReduceTree.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/ReduceTree.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/ShardTraverser.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/ShardTraverser.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/ShardTraverser.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/ShardTraverser.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/TreeReducer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/TreeReducer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/TreeReducer.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/TreeReducer.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/WindowMaker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/WindowMaker.java new file mode 100644 index 000000000..496178d88 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/WindowMaker.java @@ -0,0 +1,218 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.executive; + +import htsjdk.samtools.util.PeekableIterator; +import org.broadinstitute.gatk.engine.ReadProperties; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.engine.datasources.reads.Shard; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecordIterator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.locusiterator.LocusIterator; +import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; + +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; + +/** + * Transforms an iterator of reads which overlap the given interval list into an iterator of covered single-base loci + * completely contained within the interval list. To do this, it creates a LocusIteratorByState which will emit a single-bp + * locus for every base covered by the read iterator, then uses the WindowMakerIterator.advance() to filter down that stream of + * loci to only those covered by the given interval list. + * + * Example: + * Incoming stream of reads: A:chr20:1-5, B:chr20:2-6, C:chr20:2-7, D:chr20:3-8, E:chr20:5-10 + * Incoming intervals: chr20:3-7 + * + * Locus iterator by state will produce the following stream of data: + * chr1:1 {A}, chr1:2 {A,B,C}, chr1:3 {A,B,C,D}, chr1:4 {A,B,C,D}, chr1:5 {A,B,C,D,E}, + * chr1:6 {B,C,D,E}, chr1:7 {C,D,E}, chr1:8 {D,E}, chr1:9 {E}, chr1:10 {E} + * + * WindowMakerIterator will then filter the incoming stream, emitting the following stream: + * chr1:3 {A,B,C,D}, chr1:4 {A,B,C,D}, chr1:5 {A,B,C,D,E}, chr1:6 {B,C,D,E}, chr1:7 {C,D,E} + * + * @author mhanna + * @version 0.1 + */ +public class WindowMaker implements Iterable, Iterator { + /** + * Source information for iteration. + */ + private final ReadProperties sourceInfo; + + /** + * Hold the read iterator so that it can be closed later. + */ + private final GATKSAMRecordIterator readIterator; + + /** + * The data source for reads. Will probably come directly from the BAM file. + */ + private final PeekableIterator sourceIterator; + + /** + * Stores the sequence of intervals that the windowmaker should be tracking. + */ + private final PeekableIterator intervalIterator; + + /** + * In the case of monolithic sharding, this case returns whether the only shard has been generated. + */ + private boolean shardGenerated = false; + + /** + * The alignment context to return from this shard's iterator. Lazy implementation: the iterator will not find the + * currentAlignmentContext until absolutely required to do so. If currentAlignmentContext is null and advance() + * doesn't populate it, no more elements are available. If currentAlignmentContext is non-null, currentAlignmentContext + * should be returned by next(). + */ + private AlignmentContext currentAlignmentContext; + + /** + * Create a new window maker with the given iterator as a data source, covering + * the given intervals. + * @param iterator The data source for this window. + * @param intervals The set of intervals over which to traverse. + * @param sampleNames The complete set of sample names in the reads in shard + */ + + private final LocusIteratorByState libs; + + public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, GATKSAMIterator iterator, List intervals, Collection sampleNames) { + this.sourceInfo = shard.getReadProperties(); + this.readIterator = new GATKSAMRecordIterator(iterator); + + this.libs = new LocusIteratorByState(readIterator, + sourceInfo.getDownsamplingMethod(), sourceInfo.includeReadsWithDeletionAtLoci(), + sourceInfo.keepUniqueReadListInLIBS(), genomeLocParser,sampleNames); + this.sourceIterator = new PeekableIterator(libs); + + this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null; + } + + public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, GATKSAMIterator iterator, List intervals ) { + this(shard, genomeLocParser, iterator, intervals, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + } + + public Iterator iterator() { + return this; + } + + public boolean hasNext() { + return (intervalIterator != null && intervalIterator.hasNext()) || !shardGenerated; + } + + public WindowMakerIterator next() { + shardGenerated = true; + return new WindowMakerIterator(intervalIterator != null ? intervalIterator.next() : null); + } + + public void remove() { + throw new UnsupportedOperationException("Cannot remove from a window maker."); + } + + public void close() { + this.readIterator.close(); + } + + public class WindowMakerIterator extends LocusIterator { + /** + * The locus for which this iterator is currently returning reads. + */ + private final GenomeLoc locus; + + public WindowMakerIterator(GenomeLoc locus) { + this.locus = locus; + advance(); + } + + public ReadProperties getSourceInfo() { + return sourceInfo; + } + + public GenomeLoc getLocus() { + return locus; + } + + public WindowMakerIterator iterator() { + return this; + } + + public boolean hasNext() { + advance(); + return currentAlignmentContext != null; + } + + public AlignmentContext next() { + if(!hasNext()) throw new NoSuchElementException("WindowMakerIterator is out of elements for this interval."); + + // Consume this alignment context. + AlignmentContext toReturn = currentAlignmentContext; + currentAlignmentContext = null; + + // Return the current element. + return toReturn; + } + + private void advance() { + // Need to find the next element that is not past shard boundaries. If we travel past the edge of + // shard boundaries, stop and let the next interval pick it up. + while(currentAlignmentContext == null && sourceIterator.hasNext()) { + // Advance the iterator and try again. + AlignmentContext candidateAlignmentContext = sourceIterator.peek(); + + if(locus == null) { + // No filter present. Return everything that LocusIteratorByState provides us. + currentAlignmentContext = sourceIterator.next(); + } + else if(locus.isPast(candidateAlignmentContext.getLocation())) + // Found a locus before the current window; claim this alignment context and throw it away. + sourceIterator.next(); + else if(locus.containsP(candidateAlignmentContext.getLocation())) { + // Found a locus within the current window; claim this alignment context and call it the next entry. + currentAlignmentContext = sourceIterator.next(); + } + else if(locus.isBefore(candidateAlignmentContext.getLocation())) { + // Whoops. Skipped passed the end of the region. Iteration for this window is complete. Do + // not claim this alignment context in case it is part of the next shard. + break; + } + else + throw new ReviewedGATKException("BUG: filtering locus does not contain, is not before, and is not past the given alignment context"); + } + } + + @Override + public LocusIteratorByState getLIBS() { + return libs; + } + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/package-info.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BAQReadTransformer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BAQReadTransformer.java new file mode 100644 index 000000000..f0e889a63 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BAQReadTransformer.java @@ -0,0 +1,75 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.WalkerManager; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.engine.walkers.BAQMode; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.baq.BAQ; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +/** + * Applies Heng's BAQ calculation to a stream of incoming reads + */ +public class BAQReadTransformer extends ReadTransformer { + private BAQ baqHMM; + private IndexedFastaSequenceFile refReader; + private BAQ.CalculationMode cmode; + private BAQ.QualityMode qmode; + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { + final BAQMode mode = WalkerManager.getWalkerAnnotation(walker, BAQMode.class); + this.refReader = engine.getReferenceDataSource().getReference(); + this.cmode = engine.getArguments().BAQMode; + this.qmode = mode.QualityMode(); + baqHMM = new BAQ(engine.getArguments().BAQGOP); + + if ( qmode == BAQ.QualityMode.DONT_MODIFY ) + throw new ReviewedGATKException("BUG: shouldn't create BAQ transformer with quality mode DONT_MODIFY"); + + if ( mode.ApplicationTime() == ReadTransformer.ApplicationTime.FORBIDDEN && enabled() ) + throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + cmode + " was requested."); + + return mode.ApplicationTime(); + } + + @Override + public boolean enabled() { + return cmode != BAQ.CalculationMode.OFF; + } + + @Override + public GATKSAMRecord apply(final GATKSAMRecord read) { + baqHMM.baqRead(read, refReader, cmode, qmode); + return read; + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java new file mode 100644 index 000000000..0b7d1a905 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java @@ -0,0 +1,141 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.Cigar; +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.CigarOperator; +import htsjdk.samtools.SAMRecord; + +import java.util.Iterator; + +/** + * Filter out reads with wonky CIGAR strings + * + *

    This read filter will filter out the following cases:

    + *
      + *
    • different length and cigar length
    • + *
    • Hard/Soft clips in the middle of the cigar
    • + *
    • starting with deletions (with or without preceding clips)
    • + *
    • ending in deletions (with or without follow-up clips)
    • + *
    • fully hard or soft clipped
    • + *
    • consecutive indels in the cigar (II, DD, ID or DI)
    • + *
    + * + *

    Usage example

    + * + *

    Enable the bad cigar filter

    + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf BadCigar
    + * 
    + * + * @author ebanks + * @version 0.1 + */ + +public class BadCigarFilter extends ReadFilter { + + public boolean filterOut(final SAMRecord rec) { + final Cigar c = rec.getCigar(); + + // if there is no Cigar then it can't be bad + if( c.isEmpty() ) { + return false; + } + + // Read and it's CIGAR not the same length + if ( rec.getReadLength() != c.getReadLength() ) { + return true; + } + + Iterator elementIterator = c.getCigarElements().iterator(); + + CigarOperator firstOp = CigarOperator.H; + while (elementIterator.hasNext() && (firstOp == CigarOperator.H || firstOp == CigarOperator.S)) { + CigarOperator op = elementIterator.next().getOperator(); + + // No reads with Hard/Soft clips in the middle of the cigar + if (firstOp != CigarOperator.H && op == CigarOperator.H) { + return true; + } + firstOp = op; + } + + // No reads starting with deletions (with or without preceding clips) + if (firstOp == CigarOperator.D) { + return true; + } + + boolean hasMeaningfulElements = (firstOp != CigarOperator.H && firstOp != CigarOperator.S); + boolean previousElementWasIndel = firstOp == CigarOperator.I; + CigarOperator lastOp = firstOp; + CigarOperator previousOp = firstOp; + + while (elementIterator.hasNext()) { + CigarOperator op = elementIterator.next().getOperator(); + + if (op != CigarOperator.S && op != CigarOperator.H) { + + // No reads with Hard/Soft clips in the middle of the cigar + if (previousOp == CigarOperator.S || previousOp == CigarOperator.H) + return true; + + lastOp = op; + + if (!hasMeaningfulElements && op.consumesReadBases()) { + hasMeaningfulElements = true; + } + + if (op == CigarOperator.I || op == CigarOperator.D) { + + // No reads that have consecutive indels in the cigar (II, DD, ID or DI) + if (previousElementWasIndel) { + return true; + } + previousElementWasIndel = true; + } + else { + previousElementWasIndel = false; + } + } + // No reads with Hard/Soft clips in the middle of the cigar + else if (op == CigarOperator.S && previousOp == CigarOperator.H) { + return true; + } + + previousOp = op; + } + + // No reads ending in deletions (with or without follow-up clips) + // No reads that are fully hard or soft clipped + return lastOp == CigarOperator.D || !hasMeaningfulElements; + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java new file mode 100644 index 000000000..562e50ea9 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java @@ -0,0 +1,74 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; + +/** + * Filter out reads whose mate maps to a different contig + * + *

    This filter is intended to ensure that only reads that are likely to be mapped in the right place, and therefore + * to be informative, will be used in analysis. If mates in a pair are mapping to different contigs, it is likely that + * at least one of them is in the wrong place. One exception is you are using a draft genome assembly in which the + * chromosomes are fragmented into many contigs; then you may legitimately have reads that are correctly mapped but are + * on different contigs than their mate. This read filter can be disabled from the command line using the -drf argument. + *

    + * + *

    Enable the bad mate filter

    + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf BadMate
    + * 
    + * + *

    Disable the bad mate filter

    + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -drf BadMate
    + * 
    + * + * @author ebanks + * @version 0.1 + */ + +public class BadMateFilter extends DisableableReadFilter { + + public boolean filterOut(final SAMRecord rec) { + return hasBadMate(rec); + } + + public static boolean hasBadMate(final SAMRecord rec) { + return (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && !rec.getReferenceIndex().equals(rec.getMateReferenceIndex())); + } + +} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/CountingFilteringIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/CountingFilteringIterator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/CountingFilteringIterator.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/CountingFilteringIterator.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/DisableableReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/DisableableReadFilter.java new file mode 100644 index 000000000..9c44be3de --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/DisableableReadFilter.java @@ -0,0 +1,35 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; +import org.broadinstitute.gatk.utils.help.HelpConstants; + +@DocumentedGATKFeature( + groupName = HelpConstants.DOCS_CAT_RF, + summary = "A ReadFilter which can be disabled by using the --disable_read_filter parameter" ) +public abstract class DisableableReadFilter extends ReadFilter { +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java new file mode 100644 index 000000000..310f1dee3 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java @@ -0,0 +1,91 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Filter out duplicate reads + * + *

    This filter recognizes the SAM flag set by MarkDuplicates. It can be disabled from the command line if needed + * using the -drf argument.

    + * + *

    Usage examples

    + * + *

    Enable the duplicate read filter

    + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf DuplicateRead
    + * 
    + * + *

    Disable the duplicate read filter

    + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -drf DuplicateRead
    + * 
    + * + * @author rpoplin + * @since Dec 9, 2009 + */ + +public class DuplicateReadFilter extends DisableableReadFilter { + public boolean filterOut( final SAMRecord read ) { + return read.getDuplicateReadFlag(); + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java new file mode 100644 index 000000000..fc5cdcb53 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java @@ -0,0 +1,43 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; + +/** + * Filter out reads that fail the vendor quality check + * + *

    This filter recognizes the SAM flag corresponding to the vendor quality check.

    + * + * @author rpoplin + * @since Jul 19, 2010 + */ + +public class FailsVendorQualityCheckFilter extends ReadFilter { + public boolean filterOut( final SAMRecord read ) { + return read.getReadFailsVendorQualityCheckFlag(); + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FilterManager.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FilterManager.java new file mode 100644 index 000000000..90d8a3fd8 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FilterManager.java @@ -0,0 +1,106 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import org.broadinstitute.gatk.utils.classloader.PluginManager; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.help.GATKDocUtils; +import org.broadinstitute.gatk.utils.help.HelpConstants; + +import java.util.Collection; +import java.util.List; + +/** + * Manage filters and filter options. Any requests for basic filtering classes + * should ultimately be made through this class. + * + * @author mhanna + * @version 0.1 + */ +public class FilterManager extends PluginManager { + public FilterManager() { + super(ReadFilter.class,"filter","Filter"); + } + + /** + * Instantiate a filter of the given type. Along the way, scream bloody murder if + * the filter is not available. + * @param filterType The type of the filter + * @return The filter + */ + public ReadFilter createFilterByType(Class filterType) { + return this.createByName(getName(filterType)); + } + + public Collection> getValues() { + return this.getPlugins(); + } + + /** + * Rather than use the default error message, print out a list of read filters as well. + * @param pluginCategory - string, the category of the plugin (e.g. read filter) + * @param pluginName - string, what we were trying to match (but failed to) + * @return - A wall of text with the default message, followed by a listing of available read filters + */ + @Override + protected String formatErrorMessage(String pluginCategory, String pluginName) { + List> availableFilters = this.getPluginsImplementing(ReadFilter.class); + + + return String.format("Read filter %s not found. Available read filters:%n%n%s%n%n%s",pluginName, + userFriendlyListofReadFilters(availableFilters), + "Please consult the GATK Documentation (" + HelpConstants.GATK_DOCS_URL + ") for more information."); + } + + /** + * Rather than use the default exception, return a MalformedReadFilterException. + * @param errorMessage error message from formatErrorMessage() + * @return - A MalformedReadFilterException with errorMessage + */ + @Override + protected UserException createMalformedArgumentException(final String errorMessage) { + return new UserException.MalformedReadFilterException(errorMessage); + } + + private String userFriendlyListofReadFilters(List> filters) { + final String headName = "FilterName", headDoc = "Documentation"; + int longestNameLength = -1; + for ( Class < ? extends ReadFilter> filter : filters ) { + longestNameLength = Math.max(longestNameLength,this.getName(filter).length()); + } + String format = " %"+longestNameLength+"s %s%n"; + + StringBuilder listBuilder = new StringBuilder(); + listBuilder.append(String.format(format,headName,headDoc)); + for ( Class filter : filters ) { + String helpLink = GATKDocUtils.helpLinksToGATKDocs(filter); + String filterName = this.getName(filter); + listBuilder.append(String.format(format,filterName,helpLink)); + } + + return listBuilder.toString(); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java new file mode 100644 index 000000000..d6e78a616 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java @@ -0,0 +1,65 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.engine.filters.ReadFilter; + +/** + * Only use reads from the specified library + * + *

    This filter is useful for running on only a subset of the data as identified by a read group property. + * In the case of the library filter, the goal is usually to run quality control checks on a particular library.

    + * + *

    Usage example

    + * + *

    Enable the library read filter

    + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf LibraryRead \
    + *         -library library_name
    + * 
    + * + * @author kcibul + * @since Aug 15, 2012 + * + */ + +public class LibraryReadFilter extends ReadFilter { + @Argument(fullName = "library", shortName = "library", doc="The name of the library to keep, filtering out all others", required=true) + private String LIBRARY_TO_KEEP = null; + + public boolean filterOut( final SAMRecord read ) { + final SAMReadGroupRecord readGroup = read.getReadGroup(); + return ( readGroup == null || readGroup.getLibrary() == null || !readGroup.getLibrary().equals( LIBRARY_TO_KEEP ) ); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java new file mode 100644 index 000000000..05c6f564e --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java @@ -0,0 +1,277 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.ReadProperties; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +/** + * Filter out malformed reads + * + *

    This filter is applied automatically by all GATK tools in order to protect them from crashing on reads that are + * grossly malformed. There are a few issues (such as the absence of sequence bases) that will cause the run to fail with an + * error, but these cases can be preempted by setting flags that cause the problem reads to also be filtered.

    + * + *

    Usage example

    + * + *

    Set the malformed read filter to filter out reads that have no sequence bases

    + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -filterNoBases
    + * 
    + * + *

    Note that the MalformedRead filter itself does not need to be specified in the command line because it is set + * automatically.

    + * + * @author mhanna + * @version 0.1 + */ +public class MalformedReadFilter extends ReadFilter { + + + private static final String FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME = "filter_reads_with_N_cigar" ; + + private SAMFileHeader header; + + @Argument(fullName = FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME, shortName = "filterRNC", doc = "Filter out reads with CIGAR containing the N operator, instead of failing with an error", required = false) + boolean filterReadsWithNCigar = false; + + + @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "Filter out reads with mismatching numbers of bases and base qualities, instead of failing with an error", required = false) + boolean filterMismatchingBaseAndQuals = false; + + @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "Filter out reads with no stored bases (i.e. '*' where the sequence should be), instead of failing with an error", required = false) + boolean filterBasesNotStored = false; + + /** + * Indicates the applicable validation exclusions + */ + private boolean allowNCigars; + + @Override + public void initialize(final GenomeAnalysisEngine engine) { + header = engine.getSAMFileHeader(); + ValidationExclusion validationExclusions = null; + final SAMDataSource rds = engine.getReadsDataSource(); + if (rds != null) { + final ReadProperties rps = rds.getReadsInfo(); + if (rps != null) { + validationExclusions = rps.getValidationExclusionList(); + } + } + if (validationExclusions == null) { + allowNCigars = false; + } else { + allowNCigars = validationExclusions.contains(ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS); + } + } + + public boolean filterOut(final SAMRecord read) { + // slowly changing the behavior to blow up first and filtering out if a parameter is explicitly provided + return !checkInvalidAlignmentStart(read) || + !checkInvalidAlignmentEnd(read) || + !checkAlignmentDisagreesWithHeader(this.header,read) || + !checkHasReadGroup(read) || + !checkMismatchingBasesAndQuals(read, filterMismatchingBaseAndQuals) || + !checkCigarDisagreesWithAlignment(read) || + !checkSeqStored(read, filterBasesNotStored) || + !checkCigarIsSupported(read,filterReadsWithNCigar,allowNCigars); + } + + private static boolean checkHasReadGroup(final SAMRecord read) { + if ( read.getReadGroup() == null ) { + // there are 2 possibilities: either the RG tag is missing or it is not defined in the header + final String rgID = (String)read.getAttribute(SAMTagUtil.getSingleton().RG); + if ( rgID == null ) + throw new UserException.ReadMissingReadGroup(read); + throw new UserException.ReadHasUndefinedReadGroup(read, rgID); + } + return true; + } + + /** + * Check for the case in which the alignment start is inconsistent with the read unmapped flag. + * @param read The read to validate. + * @return true if read start is valid, false otherwise. + */ + private static boolean checkInvalidAlignmentStart(final SAMRecord read ) { + // read is not flagged as 'unmapped', but alignment start is NO_ALIGNMENT_START + if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ) + return false; + // Read is not flagged as 'unmapped', but alignment start is -1 + if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == -1 ) + return false; + return true; + } + + /** + * Check for invalid end of alignments. + * @param read The read to validate. + * @return true if read end is valid, false otherwise. + */ + private static boolean checkInvalidAlignmentEnd(final SAMRecord read ) { + // Alignment aligns to negative number of bases in the reference. + if( !read.getReadUnmappedFlag() && read.getAlignmentEnd() != -1 && (read.getAlignmentEnd()-read.getAlignmentStart()+1)<0 ) + return false; + return true; + } + + /** + * Check to ensure that the alignment makes sense based on the contents of the header. + * @param header The SAM file header. + * @param read The read to verify. + * @return true if alignment agrees with header, false othrewise. + */ + private static boolean checkAlignmentDisagreesWithHeader(final SAMFileHeader header, final SAMRecord read ) { + // Read is aligned to nonexistent contig + if( read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) + return false; + final SAMSequenceRecord contigHeader = header.getSequence( read.getReferenceIndex() ); + // Read is aligned to a point after the end of the contig + if( !read.getReadUnmappedFlag() && read.getAlignmentStart() > contigHeader.getSequenceLength() ) + return false; + return true; + } + + /** + * Check for inconsistencies between the cigar string and the + * @param read The read to validate. + * @return true if cigar agrees with alignment, false otherwise. + */ + private static boolean checkCigarDisagreesWithAlignment(final SAMRecord read) { + // Read has a valid alignment start, but the CIGAR string is empty + if( !read.getReadUnmappedFlag() && + read.getAlignmentStart() != -1 && + read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START && + read.getAlignmentBlocks().size() < 0 ) + return false; + return true; + } + + /** + * Check for unsupported CIGAR operators. + * Currently the N operator is not supported. + * @param read The read to validate. + * @param filterReadsWithNCigar whether the offending read should just + * be silently filtered or not. + * @param allowNCigars whether reads that contain N operators in their CIGARs + * can be processed or an exception should be thrown instead. + * @throws UserException.UnsupportedCigarOperatorException + * if {@link #filterReadsWithNCigar} is false and + * the input read has some unsupported operation. + * @return true if the read CIGAR operations are + * fully supported, otherwise false, as long as + * no exception has been thrown. + */ + private static boolean checkCigarIsSupported(final SAMRecord read, final boolean filterReadsWithNCigar, final boolean allowNCigars) { + if( containsNOperator(read)) { + if (! filterReadsWithNCigar && !allowNCigars) { + throw new UserException.UnsupportedCigarOperatorException( + CigarOperator.N,read, + "Perhaps you are" + + " trying to use RNA-Seq data?" + + " While we are currently actively working to" + + " support this data type unfortunately the" + + " GATK cannot be used with this data in its" + + " current form. You have the option of either" + + " filtering out all reads with operator " + + CigarOperator.N + " in their CIGAR string" + + " (please add --" + + FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME + + " to your command line) or" + + " assume the risk of processing those reads as they" + + " are including the pertinent unsafe flag (please add -U" + + ' ' + ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS + + " to your command line). Notice however that if you were" + + " to choose the latter, an unspecified subset of the" + + " analytical outputs of an unspecified subset of the tools" + + " will become unpredictable. Consequently the GATK team" + + " might well not be able to provide you with the usual support" + + " with any issue regarding any output"); + } + return ! filterReadsWithNCigar; + } + return true; + } + + private static boolean containsNOperator(final SAMRecord read) { + final Cigar cigar = read.getCigar(); + if (cigar == null) { + return false; + } + for (final CigarElement ce : cigar.getCigarElements()) { + if (ce.getOperator() == CigarOperator.N) { + return true; + } + } + return false; + } + + /** + * Check if the read has the same number of bases and base qualities + * @param read the read to validate + * @return true if they have the same number. False otherwise. + */ + private static boolean checkMismatchingBasesAndQuals(final SAMRecord read, final boolean filterMismatchingBaseAndQuals) { + final boolean result; + if (read.getReadLength() == read.getBaseQualities().length) + result = true; + else if (filterMismatchingBaseAndQuals) + result = false; + else + throw new UserException.MalformedBAM(read, + String.format("BAM file has a read with mismatching number of bases and base qualities. Offender: %s [%d bases] [%d quals].%s", + read.getReadName(), read.getReadLength(), read.getBaseQualities().length, + read.getBaseQualities().length == 0 ? " You can use --defaultBaseQualities to assign a default base quality for all reads, but this can be dangerous in you don't know what you are doing." : "")); + + return result; + } + + /** + * Check if the read has its base sequence stored + * @param read the read to validate + * @return true if the sequence is stored and false otherwise ("*" in the SEQ field). + */ + protected static boolean checkSeqStored(final SAMRecord read, final boolean filterBasesNotStored) { + + if ( read.getReadBases() != SAMRecord.NULL_SEQUENCE ) + return true; + + if ( filterBasesNotStored ) + return false; + + throw new UserException.MalformedBAM(read, String.format("the BAM file has a read with no stored bases (i.e. it uses '*') which is not supported in the GATK; see the --filter_bases_not_stored argument. Offender: %s", read.getReadName())); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java new file mode 100644 index 000000000..58ec76660 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java @@ -0,0 +1,62 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.commandline.Argument; + +/** + * Filter out reads with low mapping qualities + * + *

    This filter is intended to ensure that only reads that are likely + * to be mapped in the right place, and therefore to be informative, will be used in analysis.

    + * + *

    Usage example

    + * + *

    Set the mapping quality filter to filter out reads that have MAPQ < 15

    + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T HaplotypeCaller \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.vcf \
    + *         -rf MappingQuality \
    + *         -mmq 15
    + * 
    + * + * @author ebanks + * @version 0.1 + */ + +public class MappingQualityFilter extends ReadFilter { + + @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false) + public int MIN_MAPPING_QUALTY_SCORE = 10; + + public boolean filterOut(SAMRecord rec) { + return (rec.getMappingQuality() < MIN_MAPPING_QUALTY_SCORE); + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java new file mode 100644 index 000000000..ff1542e41 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java @@ -0,0 +1,58 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.QualityUtils; + +/** + * Filter out reads with no mapping quality information + * + * + *

    This filter is intended to ensure that only reads that are likely + * to be mapped in the right place, and therefore to be informative, will be used in analysis.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf MappingQualityUnavailable
    + * 
    + * + * @author ebanks + * @version 0.1 + */ + +public class MappingQualityUnavailableFilter extends ReadFilter { + public boolean filterOut(SAMRecord rec) { + return (rec.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE); + } +} + diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java new file mode 100644 index 000000000..b0d40c074 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java @@ -0,0 +1,56 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; + +/** + * Filter out reads with mapping quality zero + * + *

    This filter is intended to ensure that only reads that are likely + * to be mapped in the right place, and therefore to be informative, will be used in analysis.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf MappingQualityZero
    + * 
    + * + * @author hanna + * @version 0.1 + */ + +public class MappingQualityZeroFilter extends ReadFilter { + public boolean filterOut(SAMRecord rec) { + return (rec.getMappingQuality() == 0); + } +} + diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java new file mode 100644 index 000000000..20dda5427 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java @@ -0,0 +1,65 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; + +/** + * Filter out reads with bad pairing (and related) properties + * + *

    This filter is intended to ensure that only reads that are likely + * to be mapped in the right place, and therefore to be informative, will be used in analysis. + * The following cases will be filtered out: + *

    + *
      + *
    • is not paired
    • + *
    • mate is unmapped
    • + *
    • is duplicate
    • + *
    • fails vendor quality check
    • + *
    • both mate and read are in the same strand orientation
    • + *
    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf MateSameStrand
    + * 
    + * + * @author chartl + * @since 5/18/11 + */ +public class MateSameStrandFilter extends ReadFilter { + + public boolean filterOut(SAMRecord read) { + return (! read.getReadPairedFlag() ) || read.getMateUnmappedFlag() || read.getDuplicateReadFlag() || + read.getReadFailsVendorQualityCheckFlag() || (read.getMateNegativeStrandFlag() == read.getReadNegativeStrandFlag()); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java new file mode 100644 index 000000000..c7b512f2b --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java @@ -0,0 +1,59 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.commandline.Argument; + +/** + * Filter out reads that exceed a given insert size + * + *

    This filter is intended to ensure that only reads that are likely + * to be mapped in the right place, and therefore to be informative, will be used in analysis.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf MaxInsertSize \
    + *         -maxInsert 10000
    + * 
    + * + * @author chartl + * @since 5/2/11 + */ +public class MaxInsertSizeFilter extends ReadFilter { + @Argument(fullName = "maxInsertSize", shortName = "maxInsert", doc="Insert size cutoff", required=false) + private int maxInsertSize = 1000000; + + public boolean filterOut(SAMRecord record) { + return (record.getReadPairedFlag() && (record.getInferredInsertSize() > maxInsertSize || record.getInferredInsertSize() < -1*maxInsertSize)); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java new file mode 100644 index 000000000..0a7a2cdbf --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java @@ -0,0 +1,55 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; + +/** + * Filter out reads without read group information + * + *

    Many GATK tools are dependent on having read group information in order to operate correctly. This filter excludes + * any reads that have not been appropriately identified.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf MissingReadGroup
    + * 
    + * + * @author ebanks + * @version 0.1 + */ + +public class MissingReadGroupFilter extends ReadFilter { + public boolean filterOut(SAMRecord rec) { + return rec.getReadGroup() == null; + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java new file mode 100644 index 000000000..e01827723 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java @@ -0,0 +1,123 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.Cigar; +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.CigarOperator; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.iterators.RNAReadTransformer; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +/** + * Reduce NDN cigar elements to one N element. + * + *

    This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length + * of the three refactored elements). The engine parameter that activate this read transformer is + * `--refactor_NDN_cigar_string` / `-fixNDN`

    + * + *

    Rationale

    + *

    Some RNAseq aligners that use a known transcriptome resource (such as TopHat2) produce NDN elements in read CIGARS + * when a small exon is entirely deleted during transcription, which ends up looking like [exon1]NDN[exon3]. Currently + * we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of + * those specific reads, this read transformer allows users of TopHat and other tools to circumvent this problem without + * affecting the rest of their dataset. From the point of view of variant calling, there is no meaningful difference between + * the two representations.

    + * + *

    Developer notes

    + *
      + *
    • Any walker that needs to apply this functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine.
    • + *
    + * + * + * @author ami + * @since 04/22/14 + */ + +public class NDNCigarReadTransformer extends RNAReadTransformer { + + private boolean refactorReads; + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { + refactorReads = engine.getArguments().REFACTOR_NDN_CIGAR_READS; + + return ApplicationTime.HANDLED_IN_WALKER; // NOTE: any walker that need that functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine. + } + + @Override + public GATKSAMRecord apply(final GATKSAMRecord read) { + if(read == null) + throw new UserException.BadInput("try to transform a null GATKSAMRecord"); + final Cigar originalCigar = read.getCigar(); + if (originalCigar.isValid(read.getReadName(),-1) != null) + throw new UserException.BadInput("try to transform a read with non-valid cigar string: readName: "+read.getReadName()+" Cigar String: "+originalCigar); + read.setCigar(refactorNDNtoN(originalCigar)); + return read; + } + + @Override + public boolean enabled() { + return refactorReads; + } + + + + protected Cigar refactorNDNtoN(final Cigar originalCigar) { + final Cigar refactoredCigar = new Cigar(); + final int cigarLength = originalCigar.numCigarElements(); + for(int i = 0; i < cigarLength; i++){ + final CigarElement element = originalCigar.getCigarElement(i); + if(element.getOperator() == CigarOperator.N && thereAreAtLeast2MoreElements(i,cigarLength)){ + final CigarElement nextElement = originalCigar.getCigarElement(i+1); + final CigarElement nextNextElement = originalCigar.getCigarElement(i+2); + + // if it is N-D-N replace with N (with the total length) otherwise just add the first N. + if(nextElement.getOperator() == CigarOperator.D && nextNextElement.getOperator() == CigarOperator.N){ + final int threeElementsLength = element.getLength() + nextElement.getLength() + nextNextElement.getLength(); + final CigarElement refactoredElement = new CigarElement(threeElementsLength,CigarOperator.N); + refactoredCigar.add(refactoredElement); + i += 2; //skip the elements that were refactored + } + else + refactoredCigar.add(element); // add only the first N + } + else + refactoredCigar.add(element); // add any non-N element + } + return refactoredCigar; + } + + private boolean thereAreAtLeast2MoreElements(final int index, final int cigarLength){ + return index < cigarLength - 2; + } + +} + diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java new file mode 100644 index 000000000..4e8a1dc2b --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java @@ -0,0 +1,54 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; + + +/** + * Filter out reads that do not have an original quality quality score (OQ) tag + * + *

    The OQ tag can be added during the base recalibration process in order to preserve original information.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf NoOriginalQualityScores
    + * 
    + * + * @author rpoplin + * @since Nov 19, 2009 + */ +public class NoOriginalQualityScoresFilter extends ReadFilter { + public boolean filterOut( final SAMRecord read ) { + return (read.getAttribute("OQ") == null); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java new file mode 100644 index 000000000..55a697d3e --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java @@ -0,0 +1,56 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; + +/** + * Filter out reads that are secondary alignments + * + *

    This filter recognizes the SAM flag that identifies secondary alignments (ie not the best alignment). + * It is intended to ensure that only reads that are likely to be mapped in the right place, and therefore to be + * informative, will be used in analysis.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf NotPrimaryAlignment
    + * 
    + * + * @author rpoplin + * @since Dec 9, 2009 + */ + +public class NotPrimaryAlignmentFilter extends ReadFilter { + public boolean filterOut( final SAMRecord read ) { + return read.getNotPrimaryAlignmentFlag(); + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java new file mode 100644 index 000000000..f1b375835 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java @@ -0,0 +1,57 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; + +/** + * Filter out reads produced by 454 technology + * + *

    Reads produced by 454 technology should not be processed by the GATK's indel realignment tools. This filter is + * applied by those tools to enforce that rule.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf Platform454
    + * 
    + * + * @author ebanks + * @version 0.1 + */ + +public class Platform454Filter extends ReadFilter { + public boolean filterOut(SAMRecord rec) { + return (ReadUtils.is454Read((GATKSAMRecord)rec)); + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java new file mode 100644 index 000000000..7ca07d35d --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java @@ -0,0 +1,65 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; + +/** + * Filter out reads that were generated by a specific sequencing platform + * + *

    This filter is useful for running on only a subset of the data as identified by a read group property. + * In the case of the platform filter, the goal is usually to blacklist certain sequencing technologies at certain processing steps + * if we know there is an incompatibility problem (like 454 and indel realignment, which is special-cased).

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf Platform \
    + *         -PLFilterName platform_name
    + * 
    + * + * @author ebanks + * @version 0.1 + */ +public class PlatformFilter extends ReadFilter { + @Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this string", required=false) + protected String[] PLFilterNames; + + public boolean filterOut(SAMRecord rec) { + for ( String name : PLFilterNames ) + if ( ReadUtils.isPlatformRead((GATKSAMRecord)rec, name.toUpperCase() )) + return true; + return false; + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java new file mode 100644 index 000000000..b0e0bbebb --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java @@ -0,0 +1,90 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.util.HashSet; +import java.util.Set; + +/** + * Filter out reads with blacklisted platform unit tags + * + *

    This filter is useful for running on only a subset of the data as identified by a read group property. + * In the case of the platform unit filter, the goal is usually to blacklist certain runs if we know there was a problem with + * a particular sequencing machine.

    + * + * @author asivache + * @since Sep 21, 2009 + */ +public class PlatformUnitFilter extends ReadFilter { + // a hack: use static in order to be able to fill it with the data from command line at runtime + static private Set blackListedLanes = new HashSet(); + + public boolean filterOut(SAMRecord samRecord) { + + if ( blackListedLanes.size() == 0 ) return false; // no filters set, nothing to do + + Object pu_attr = samRecord.getAttribute("PU"); + + if ( pu_attr == null ) { + // no platform unit in the record, go get from read group + SAMReadGroupRecord rgr = samRecord.getReadGroup(); + if ( rgr == null ) throw new UserException.MalformedBAM(samRecord, "Read " + samRecord.getReadName() +" has NO associated read group record"); + pu_attr = rgr.getAttribute("PU") ; + } + if ( pu_attr == null ) return false; // could not get PU, forget about the filtering... + return blackListedLanes.contains((String)pu_attr); + } + + /** + * The argument is interpreted as a comma-separated list of lanes (platform units) to be filtered + * out. All the specified names will be registered with the filter and filterOut(r) for any SAMRecord r + * belonging to one of the specified lanes will thereafter return true. + * The names can be surrounded by additional spaces, the latters will be trimmed by this method. + * This method can be called multiple times to add more lanes. Re-registering the same lane again is safe. + * @param arg + */ + public static void setBlackListedLanes(String arg) { + String[] lanes = arg.split(","); + for ( int i = 0; i < lanes.length ; i++ ) { + blackListedLanes.add(lanes[i].trim()); + } + } + + /** + * Adds a single name of a lane (platform unit) to be filtered out by this filter. The name can be surrounded + * by spaces, the latters will be trimmed out. This method can be called multiple times to add more lanes. + * Re-registering the same lane again is safe. + * @param arg + */ + public static void addBlackListedLane(String arg) { + blackListedLanes.add(arg.trim()); + } + +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilterHelper.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilterHelper.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilterHelper.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilterHelper.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadFilter.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java new file mode 100644 index 000000000..9f815cf72 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java @@ -0,0 +1,133 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.text.XReadLines; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; +import java.util.Map.Entry; + +/** + * Filter out reads matching a read group tag value + * + *

    This filter is useful for running on only a subset of the data as identified by a read group property, + * using expression matching against the read group tags.

    + * + *

    Usage example

    + * + *

    Set the read group filter to blacklist read groups that have the PU tag "1000G-mpimg-080821-1_1"

    + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf ReadGroupBlackList \
    + *         -rgbl PU:1000G-mpimg-080821-1_1
    + * 
    + */ +public class ReadGroupBlackListFilter extends ReadFilter { + private Set>> filterEntries; + + public ReadGroupBlackListFilter(List blackLists) { + Map> filters = new TreeMap>(); + for (String blackList : blackLists) + addFilter(filters, blackList, null, 0); + this.filterEntries = filters.entrySet(); + } + + public boolean filterOut(SAMRecord samRecord) { + for (Entry> filterEntry : filterEntries) { + String attributeType = filterEntry.getKey(); + + SAMReadGroupRecord samReadGroupRecord = samRecord.getReadGroup(); + if (samReadGroupRecord != null) { + Object attribute; + if ("ID".equals(attributeType) || "RG".equals(attributeType)) + attribute = samReadGroupRecord.getId(); + else + attribute = samReadGroupRecord.getAttribute(attributeType); + if (attribute != null && filterEntry.getValue().contains(attribute)) + return true; + } + } + + return false; + } + + private void addFilter(Map> filters, String filter, File parentFile, int parentLineNum) { + if (filter.toLowerCase().endsWith(".list") || filter.toLowerCase().endsWith(".txt")) { + File file = new File(filter); + try { + int lineNum = 0; + XReadLines lines = new XReadLines(file); + for (String line : lines) { + lineNum++; + + if (line.trim().length() == 0) + continue; + + if (line.startsWith("#")) + continue; + + addFilter(filters, line, file, lineNum); + } + } catch (FileNotFoundException e) { + String message = "Error loading black list: " + file.getAbsolutePath(); + if (parentFile != null) { + message += ", " + parentFile.getAbsolutePath() + ":" + parentLineNum; + } + throw new UserException(message); + } + } else { + String[] filterEntry = filter.split(":", 2); + + String message = null; + if (filterEntry.length != 2) { + message = "Invalid read group filter: " + filter; + } else if (filterEntry[0].length() != 2) { + message = "Tag is not two characters: " + filter; + } + + if (message != null) { + if (parentFile != null) { + message += ", " + parentFile.getAbsolutePath() + ":" + parentLineNum; + } + message += ", format is :"; + throw new UserException(message); + } + + if (!filters.containsKey(filterEntry[0])) + filters.put(filterEntry[0], new TreeSet()); + filters.get(filterEntry[0]).add(filterEntry[1]); + } + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java new file mode 100644 index 000000000..f9a6fab57 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java @@ -0,0 +1,63 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.commandline.Argument; + +/** + * Filter out reads based on length + * + *

    This filter is useful for running on only reads that are longer (or shorter) than the given threshold sizes.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf ReadLength \
    + *         -minRead 50 \
    + *         -maxRead 101
    + * 
    + * + * @author mhanna + * @version 0.1 + */ +public class ReadLengthFilter extends ReadFilter { + @Argument(fullName = "maxReadLength", shortName = "maxRead", doc="Discard reads with length greater than the specified value", required=true) + private int maxReadLength; + + @Argument(fullName = "minReadLength", shortName = "minRead", doc="Discard reads with length shorter than the specified value", required=true) + private int minReadLength = 1; + public boolean filterOut(SAMRecord read) { + // check the length + return read.getReadLength() > maxReadLength || read.getReadLength() < minReadLength; + } + +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java new file mode 100644 index 000000000..cdee7e14b --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java @@ -0,0 +1,59 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.commandline.Argument; + +/** + * Only use reads with this read name + * + *

    This filter is useful for isolating a particular read, pair of reads or or set of alignments for a given read + * when troubleshooting issues where the error message provided a culprit name.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf ReadName \
    + *         -rn read_name
    + * 
    + * + * @author chartl + * @since 9/19/11 + */ +public class ReadNameFilter extends ReadFilter { + @Argument(fullName = "readName", shortName = "rn", doc="Read name to whitelist", required=true) + private String readName; + + public boolean filterOut(final SAMRecord rec) { + return ! rec.getReadName().equals(readName); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java new file mode 100644 index 000000000..292803d1c --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java @@ -0,0 +1,62 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.commandline.Argument; + +/** + * Filter out reads based on strand orientation + * + *

    This filter is useful for isolating reads from only forward or reverse strands. By default, it filters out reads + * from the negative (reverse) strand. This logic can be reversed by using the -filterPositive flag.

    + * + *

    Usage example

    + * + *

    Set the read strand filter to filter out positive (forward) strand reads

    + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf ReadStrand \
    + *         -filterPositive
    + * 
    + * + * @author chartl + * @version 0.1 + */ +public class ReadStrandFilter extends ReadFilter { + @Argument(fullName = "filterPositive", shortName = "fp", doc="Discard reads on the forward strand",required=false) + boolean filterForward = false; + + public boolean filterOut(SAMRecord read) { + // check the length + return read.getReadNegativeStrandFlag() != filterForward; + } + +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java new file mode 100644 index 000000000..89be38db7 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java @@ -0,0 +1,88 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.commandline.Argument; + +/** + * Set the mapping quality of all reads to a given value. + * + *

    + * If a BAM file contains erroneous or missing mapping qualities (MAPQ), this read transformer will set all your + * mapping qualities to a given value (see arguments list for default value). + *

    + * + *

    See also

    + * + *

    ReassignOneMappingQualityFilter: reassigns a single MAPQ value, as opposed to all those found in the BAM file.

    + * + *

    Caveats

    + * + *

    Note that due to the order of operations involved in applying filters, it is possible that other read filters + * (determined either at command-line or internally by the tool you are using) will be applied to your data before + * this read transformation can be applied. If one of those other filters acts on the read mapping quality (MAPQ), + * then you may not obtain the expected results. Unfortunately it is currently not possible to change the order of + * operations from command line. To avoid the problem, we recommend applying this filter separately from any other + * analysis, using PrintReads.

    + * + * + *

    Input

    + *

    + * BAM file(s) + *

    + * + *

    Output

    + *

    + * BAM file(s) with the mapping qualities of all reads reassigned to the specified value + *

    + * + *

    Usage example

    + *
    + *  java -jar GenomeAnalysisTK.jar \
    + *      -T PrintReads \
    + *      -R reference.fasta \
    + *      -I input.bam \
    + *      -o output.file \
    + *      -rf ReassignMappingQuality \
    + *      -DMQ 35
    + *  
    + * + * @author carneiro + * @since 8/8/11 + */ + +public class ReassignMappingQualityFilter extends ReadFilter { + + @Argument(fullName = "default_mapping_quality", shortName = "DMQ", doc = "Default read mapping quality to assign to all reads", required = false) + public int defaultMappingQuality = 60; + + public boolean filterOut(SAMRecord rec) { + rec.setMappingQuality(defaultMappingQuality); + return false; + } +} + diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java new file mode 100644 index 000000000..2ff1d5a4e --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java @@ -0,0 +1,90 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.commandline.Argument; + +/** + * Set the mapping quality of reads with a given value to another given value. + * + *

    + * This read transformer will change a certain read mapping quality to a different value without affecting reads that + * have other mapping qualities. This is intended primarily for users of RNA-Seq data handling programs such + * as TopHat, which use MAPQ = 255 to designate uniquely aligned reads. According to convention, 255 normally + * designates "unknown" quality, and most GATK tools automatically ignore such reads. By reassigning a different + * mapping quality to those specific reads, users of TopHat and other tools can circumvent this problem without + * affecting the rest of their dataset. + *

    + * + *

    + * This differs from the ReassignMappingQuality filter by its selectivity -- only one mapping quality is targeted. + * ReassignMappingQuality will change ALL mapping qualities to a single one, and is typically used for datasets + * that have no assigned mapping qualities. + *

    + * + *

    Input

    + *

    + * BAM file(s) + *

    + * + * + *

    Output

    + *

    + * BAM file(s) with one read mapping quality selectively reassigned as desired + *

    + * + *

    Usage example

    + *
    + *    java -jar GenomeAnalysisTK.jar \
    + *      -T PrintReads \
    + *      -R reference.fasta \
    + *      -I input.bam \
    + *      -o output.file \
    + *      -rf ReassignOneMappingQuality \
    + *      -RMQF 255 \
    + *      -RMQT 60
    + *  
    + * + * @author vdauwera + * @since 2/19/13 + */ + +public class ReassignOneMappingQualityFilter extends ReadFilter { + + @Argument(fullName = "reassign_mapping_quality_from", shortName = "RMQF", doc = "Original mapping quality", required = false) + public int reassignMappingQualityFrom = 255; + + @Argument(fullName = "reassign_mapping_quality_to", shortName = "RMQT", doc = "Desired mapping quality", required = false) + public int reassignMappingQualityTo = 60; + + public boolean filterOut(SAMRecord rec) { + if (rec.getMappingQuality() == reassignMappingQualityFrom) + rec.setMappingQuality(reassignMappingQualityTo); + return false; + } +} + diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java new file mode 100644 index 000000000..ab63e1e00 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java @@ -0,0 +1,61 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.commandline.Argument; + +import java.util.Set; + +/** + * Only use reads belonging to a specific sample + * + *

    This filter is useful for isolating data from one particular sample in a multisample file.

    + * + *

    Usage example

    + * + *

    Use only reads from the sample named NA12878

    + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf Sample \
    + *         -goodSM NA12878
    + * 
    + * + */ +public class SampleFilter extends ReadFilter { + @Argument(fullName = "sample_to_keep", shortName = "goodSM", doc="The name of the sample(s) to keep, filtering out all others", required=true) + private Set SAMPLES_TO_KEEP = null; + + public boolean filterOut( final SAMRecord read ) { + final SAMReadGroupRecord readGroup = read.getReadGroup(); + return !( readGroup != null && SAMPLES_TO_KEEP.contains(readGroup.getSample()) ); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java new file mode 100644 index 000000000..58cf9183d --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java @@ -0,0 +1,63 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.commandline.Argument; + +/** + * Only use reads from the specified read group + * + *

    This filter is useful for isolating data from one particular read group (usually a single lane).

    + * + *

    Usage example

    + * + *

    Use only reads from the read group with ID "read_group_1

    + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf SingleReadGroup \
    + *         -goodRG read_group_1
    + * 
    + * + * @author rpoplin + * @since Nov 27, 2009 + * + */ + +public class SingleReadGroupFilter extends ReadFilter { + @Argument(fullName = "read_group_to_keep", shortName = "goodRG", doc="The name of the read group to keep, filtering out all others", required=true) + private String READ_GROUP_TO_KEEP = null; + + public boolean filterOut( final SAMRecord read ) { + final SAMReadGroupRecord readGroup = read.getReadGroup(); + return !( readGroup != null && readGroup.getReadGroupId().equals( READ_GROUP_TO_KEEP ) ); + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java new file mode 100644 index 000000000..d5f8d30ff --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java @@ -0,0 +1,56 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.SAMRecord; + +/** + * Filter out unmapped reads + * + * + *

    This filter recognizes the SAM flag corresponding to being unmapped. It is intended to ensure that only + * reads that are likely to be mapped in the right place, and therefore to be informative, will be used in analysis.

    + * + *

    Usage example

    + * + *
    + *     java -jar GenomeAnalysisTk.jar \
    + *         -T ToolName \
    + *         -R reference.fasta \
    + *         -I input.bam \
    + *         -o output.file \
    + *         -rf UnmappedRead
    + * 
    + * + * @author rpoplin + * @since Dec 9, 2009 + */ + +public class UnmappedReadFilter extends ReadFilter { + public boolean filterOut( final SAMRecord read ) { + return read.getReadUnmappedFlag() || read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START; + } +} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/package-info.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/BySampleSAMFileWriter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/BySampleSAMFileWriter.java new file mode 100644 index 000000000..6dd8833b8 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/BySampleSAMFileWriter.java @@ -0,0 +1,70 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.io; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMProgramRecord; +import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.util.HashMap; +import java.util.Map; + +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: Nov 13 + */ +public class BySampleSAMFileWriter extends NWaySAMFileWriter { + + private final Map sampleToWriterMap; + + public BySampleSAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { + super(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, pRecord, keep_records); + + sampleToWriterMap = new HashMap(toolkit.getSAMFileHeader().getReadGroups().size() * 2); + + for (SAMReaderID readerID : toolkit.getReadsDataSource().getReaderIDs()) { + for (SAMReadGroupRecord rg : toolkit.getReadsDataSource().getHeader(readerID).getReadGroups()) { + String sample = rg.getSample(); + if (sampleToWriterMap.containsKey(sample) && sampleToWriterMap.get(sample) != readerID) { + throw new ReviewedGATKException("The same sample appears in multiple files, this input cannot be multiplexed using the BySampleSAMFileWriter, try NWaySAMFileWriter instead."); + } + else { + sampleToWriterMap.put(sample, readerID); + } + } + } + } + + @Override + public void addAlignment(SAMRecord samRecord) { + super.addAlignment(samRecord, sampleToWriterMap.get(samRecord.getReadGroup().getSample())); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/DirectOutputTracker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/DirectOutputTracker.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/DirectOutputTracker.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/DirectOutputTracker.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/FastqFileWriter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/FastqFileWriter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/FastqFileWriter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/FastqFileWriter.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/NWaySAMFileWriter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/NWaySAMFileWriter.java new file mode 100644 index 000000000..74ed19d3e --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/NWaySAMFileWriter.java @@ -0,0 +1,255 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.io; + +import htsjdk.samtools.*; +import htsjdk.samtools.util.ProgressLoggerInterface; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.utils.exceptions.GATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; + +import java.io.File; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: May 31, 2011 + * Time: 3:52:49 PM + * To change this template use File | Settings | File Templates. + */ +public class NWaySAMFileWriter implements SAMFileWriter { + + private Map writerMap = null; + private boolean presorted ; + GenomeAnalysisEngine toolkit; + boolean KEEP_ALL_PG_RECORDS = false; + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { + this.presorted = presorted; + this.toolkit = toolkit; + this.KEEP_ALL_PG_RECORDS = keep_records; + writerMap = new HashMap(); + setupByReader(toolkit,in2out,order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly , boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { + this.presorted = presorted; + this.toolkit = toolkit; + this.KEEP_ALL_PG_RECORDS = keep_records; + writerMap = new HashMap(); + setupByReader(toolkit,ext,order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5) { + this(toolkit, in2out, order, presorted, indexOnTheFly, generateMD5, null,false); + } + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly , boolean generateMD5) { + this(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, null,false); + } + + /** + * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets + * up the writer with the header and presorted status. + * + * @param originalHeader original header + * @param programRecord the program record for this program + */ + public static SAMFileHeader setupWriter(final SAMFileHeader originalHeader, final SAMProgramRecord programRecord) { + final SAMFileHeader header = originalHeader.clone(); + final List oldRecords = header.getProgramRecords(); + final List newRecords = new ArrayList(oldRecords.size()+1); + for ( SAMProgramRecord record : oldRecords ) + if ( (programRecord != null && !record.getId().startsWith(programRecord.getId()))) + newRecords.add(record); + + if (programRecord != null) { + newRecords.add(programRecord); + header.setProgramRecords(newRecords); + } + return header; + } + + /** + * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and returns + * the new header to be added to the BAM writer. + * + * @param toolkit the engine + * @param walker the walker object (so we can extract the command line) + * @param PROGRAM_RECORD_NAME the name for the PG tag + * @return a pre-filled header for the bam writer + */ + public static SAMFileHeader setupWriter(final GenomeAnalysisEngine toolkit, final SAMFileHeader originalHeader, final Object walker, final String PROGRAM_RECORD_NAME) { + final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME); + return setupWriter(originalHeader, programRecord); + } + + /** + * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets + * up the writer with the header and presorted status. + * + * @param writer BAM file writer + * @param toolkit the engine + * @param preSorted whether or not the writer can assume reads are going to be added are already sorted + * @param walker the walker object (so we can extract the command line) + * @param PROGRAM_RECORD_NAME the name for the PG tag + */ + public static void setupWriter(GATKSAMFileWriter writer, GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean preSorted, Object walker, String PROGRAM_RECORD_NAME) { + SAMFileHeader header = setupWriter(toolkit, originalHeader, walker, PROGRAM_RECORD_NAME); + writer.writeHeader(header); + writer.setPresorted(preSorted); + } + + /** + * Creates a program record (@PG) tag + * + * @param toolkit the engine + * @param walker the walker object (so we can extract the command line) + * @param PROGRAM_RECORD_NAME the name for the PG tag + * @return a program record for the tool + */ + public static SAMProgramRecord createProgramRecord(GenomeAnalysisEngine toolkit, Object walker, String PROGRAM_RECORD_NAME) { + final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); + try { + programRecord.setProgramVersion(CommandLineProgram.getVersionNumber()); + } catch (MissingResourceException e) { + // couldn't care less if the resource is missing... + } + programRecord.setCommandLine(toolkit.createApproximateCommandLineArgumentString(toolkit, walker)); + return programRecord; + } + + /** + * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved + * from toolkit). The in2out map must contain an entry for each input filename and map it + * onto a unique output file name. + * @param toolkit + * @param in2out + */ + public void setupByReader(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { + if ( in2out==null ) throw new GATKException("input-output bam filename map for n-way-out writing is NULL"); + for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { + + String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); + + String outName; + if ( ! in2out.containsKey(fName) ) + throw new UserException.BadInput("Input-output bam filename map does not contain an entry for the input file "+fName); + outName = in2out.get(fName); + + if ( writerMap.containsKey( rid ) ) + throw new GATKException("nWayOut mode: Reader id for input sam file "+fName+" is already registered; "+ + "map file likely contains multiple entries for this input file"); + + addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + } + + /** + * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved + * from toolkit). The output file names will be generated automatically by stripping ".sam" or ".bam" off the + * input file name and adding ext instead (e.g. ".cleaned.bam"). + * onto a unique output file name. + * @param toolkit + * @param ext + */ + public void setupByReader(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { + for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { + + String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); + + String outName; + int pos ; + if ( fName.toUpperCase().endsWith(".BAM") ) pos = fName.toUpperCase().lastIndexOf(".BAM"); + else { + if ( fName.toUpperCase().endsWith(".SAM") ) pos = fName.toUpperCase().lastIndexOf(".SAM"); + else throw new UserException.BadInput("Input file name "+fName+" does not end with .sam or .bam"); + } + String prefix = fName.substring(0,pos); + outName = prefix+ext; + + if ( writerMap.containsKey( rid ) ) + throw new GATKException("nWayOut mode: Reader id for input sam file "+fName+" is already registered"); + addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + } + + private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, + boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) { + File f = new File(outName); + SAMFileHeader header = setupWriter(toolkit.getSAMFileHeader(id), programRecord); + SAMFileWriterFactory factory = new SAMFileWriterFactory(); + factory.setCreateIndex(indexOnTheFly); + factory.setCreateMd5File(generateMD5); + SAMFileWriter sw = factory.makeSAMOrBAMWriter(header, presorted, f); + writerMap.put(id,sw); + } + + public Collection getWriters() { + return writerMap.values(); + } + + public void addAlignment(SAMRecord samRecord) { + final SAMReaderID id = toolkit.getReaderIDForRead(samRecord); + String rg = samRecord.getStringAttribute("RG"); + if ( rg != null ) { + String rg_orig = toolkit.getReadsDataSource().getOriginalReadGroupId(rg); + samRecord.setAttribute("RG",rg_orig); + } + addAlignment(samRecord, id); + } + + public void addAlignment(SAMRecord samRecord, SAMReaderID readerID) { + writerMap.get(readerID).addAlignment(samRecord); + } + + public SAMFileHeader getFileHeader() { + return toolkit.getSAMFileHeader(); + } + + public void close() { + for ( SAMFileWriter w : writerMap.values() ) w.close(); + } + + @Override + public void setProgressLogger(final ProgressLoggerInterface logger) { + for (final SAMFileWriter writer: writerMap.values()) { + writer.setProgressLogger(logger); + } + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/OutputTracker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/OutputTracker.java new file mode 100644 index 000000000..d5925900c --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/OutputTracker.java @@ -0,0 +1,193 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.io; + +import htsjdk.samtools.ValidationStringency; +import org.broadinstitute.gatk.utils.io.ReferenceBacked; +import org.broadinstitute.gatk.utils.commandline.ArgumentSource; +import org.broadinstitute.gatk.engine.io.storage.Storage; +import org.broadinstitute.gatk.engine.io.storage.StorageFactory; +import org.broadinstitute.gatk.engine.io.stubs.OutputStreamStub; +import org.broadinstitute.gatk.engine.io.stubs.Stub; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.classloader.JVMUtils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.io.IOUtils; +import org.broadinstitute.gatk.utils.sam.SAMReaderBuilder; + +import java.io.File; +import java.io.OutputStream; +import java.lang.reflect.Field; +import java.util.HashMap; +import java.util.Map; + +/** + * Manages the output and err streams that are created specifically for walker + * output. + */ +public abstract class OutputTracker implements ReferenceBacked { + /** + * The reference file. + */ + private File referenceFile; + + /** + * The streams to which walker users should be reading directly. + */ + protected Map inputs = new HashMap(); + + /** + * The streams to which walker users should be writing directly. + */ + protected Map outputs = new HashMap(); + + /** + * Special-purpose stub. Provides a connection to output streams. + */ + protected OutputStreamStub outStub = null; + + /** + * Special-purpose stream. Provides a connection to error streams. + */ + protected OutputStreamStub errStub = null; + + /** + * Gets the output storage associated with a given stub. + * @param stub The stub for which to find / create the right output stream. + * @param Type of the stream to create. + * @return Storage object with a facade of type T. + */ + public abstract T getStorage( Stub stub ); + + @Override + public File getReferenceFile() { + return referenceFile; + } + + @Override + public void setReferenceFile(final File referenceFile) { + this.referenceFile = referenceFile; + } + + public void prepareWalker( Walker walker, ValidationStringency strictnessLevel ) { + for( Map.Entry io: inputs.entrySet() ) { + ArgumentSource targetField = io.getKey(); + Object targetValue = io.getValue(); + + // Ghastly hack: reaches in and finishes building out the SAMFileReader. + // TODO: Generalize this, and move it to its own initialization step. + if( targetValue instanceof SAMReaderBuilder) { + SAMReaderBuilder builder = (SAMReaderBuilder)targetValue; + builder.setValidationStringency(strictnessLevel); + targetValue = builder.build(); + } + + JVMUtils.setFieldValue( targetField.field, walker, targetValue ); + } + } + + /** + * Provide a mechanism for injecting supplemental streams for external management. + * @param argumentSource source Class / field into which to inject this stream. + * @param stub Stream to manage. + */ + public void addInput( ArgumentSource argumentSource, Object stub ) { + inputs.put(argumentSource,stub); + } + + /** + * Provide a mechanism for injecting supplemental streams for external management. + * @param stub Stream to manage. + */ + public void addOutput(Stub stub) { + addOutput(stub,null); + } + + /** + * Provide a mechanism for injecting supplemental streams for external management. + * @param stub Stream to manage. + */ + public void addOutput(Stub stub, Storage storage) { + stub.register(this); + outputs.put(stub,storage); + validateOutputPath(stub); + } + + /** + * Close down all existing output streams. + */ + public void close() { + for( Stub stub: outputs.keySet() ) { + // If the stream hasn't yet been created, create it so that there's at least an empty file present. + if( outputs.get(stub) == null ) + getTargetStream(stub); + + // Close down the storage. + outputs.get(stub).close(); + } + } + + /** + * Collects the target stream for this data. + * @param stub The stub for this stream. + * @param type of stub. + * @return An instantiated file into which data can be written. + */ + protected T getTargetStream( Stub stub ) { + if( !outputs.containsKey(stub) ) + throw new ReviewedGATKException("OutputTracker was not notified that this stub exists: " + stub); + Storage storage = outputs.get(stub); + if( storage == null ) { + storage = StorageFactory.createStorage(stub); + outputs.put(stub,storage); + } + return (T)storage; + } + + /** + * Ensures that the File associated with this stub (if any) is in a writable location + * @param stub + */ + protected void validateOutputPath(final Stub stub) { + if (stub.getOutputFile() != null && !(IOUtils.isSpecialFile(stub.getOutputFile()))) { + final File parentDir = stub.getOutputFile().getAbsoluteFile().getParentFile(); + if (! (parentDir.canWrite() && parentDir.canExecute())) + throw new UserException.CouldNotCreateOutputFile(stub.getOutputFile(), + "either the containing directory doesn't exist or it isn't writable"); + } + } + + /** + * Install an OutputStreamStub into the given fieldName of the given walker. + * @param walker Walker into which to inject the field name. + * @param fieldName Name of the field into which to inject the stub. + */ + private void installStub( Walker walker, String fieldName, OutputStream outputStream ) { + Field field = JVMUtils.findField( walker.getClass(), fieldName ); + JVMUtils.setFieldValue( field, walker, outputStream ); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/ThreadGroupOutputTracker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/ThreadGroupOutputTracker.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/ThreadGroupOutputTracker.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/ThreadGroupOutputTracker.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/OutputStreamStorage.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/OutputStreamStorage.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/OutputStreamStorage.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/OutputStreamStorage.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/SAMFileWriterStorage.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/SAMFileWriterStorage.java new file mode 100644 index 000000000..68943f887 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/SAMFileWriterStorage.java @@ -0,0 +1,174 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.io.storage; + +import htsjdk.samtools.*; +import htsjdk.samtools.util.CloseableIterator; +import htsjdk.samtools.util.ProgressLoggerInterface; +import htsjdk.samtools.util.RuntimeIOException; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.io.stubs.SAMFileWriterStub; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.SimplifyingSAMFileWriter; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; + +/** + * Provides temporary storage for SAMFileWriters. + * + * @author mhanna + * @version 0.1 + */ +public class SAMFileWriterStorage implements SAMFileWriter, Storage { + private final File file; + private File referenceFasta; + private SAMFileWriter writer; + + private static Logger logger = Logger.getLogger(SAMFileWriterStorage.class); + + public SAMFileWriterStorage( SAMFileWriterStub stub ) { + this(stub,stub.getOutputFile()); + } + + public SAMFileWriterStorage( SAMFileWriterStub stub, File file ) { + this.referenceFasta = stub.getReferenceFile(); + this.file = file; + SAMFileWriterFactory factory = new SAMFileWriterFactory(); + // Enable automatic index creation for pre-sorted BAMs. + if (stub.getFileHeader().getSortOrder().equals(SAMFileHeader.SortOrder.coordinate) && stub.getIndexOnTheFly()) + factory.setCreateIndex(true); + if (stub.getGenerateMD5()) + factory.setCreateMd5File(true); + // Adjust max records in RAM. + // TODO -- this doesn't actually work because of a bug in Picard; do not use until fixed + if(stub.getMaxRecordsInRam() != null) + factory.setMaxRecordsInRam(stub.getMaxRecordsInRam()); + + if(stub.getOutputFile() != null) { + try { + if (stub.getOutputFile().getName().toLowerCase().endsWith(".cram")) { + this.writer = createCRAMWriter(factory, stub.getFileHeader(), new FileOutputStream(file), this.referenceFasta); + } else { + this.writer = createBAMWriter(factory,stub.getFileHeader(),stub.isPresorted(),file,stub.getCompressionLevel()); + } + } catch(IOException ex) { + throw new UserException.CouldNotCreateOutputFile(file, "file could not be created", ex); + } catch(RuntimeIOException ex) { + throw new UserException.CouldNotCreateOutputFile(file,"file could not be created",ex); + } + } + else if(stub.getOutputStream() != null){ + this.writer = factory.makeSAMWriter( stub.getFileHeader(), stub.isPresorted(), stub.getOutputStream()); + } + else + throw new UserException("Unable to write to SAM file; neither a target file nor a stream has been specified"); + + // if we want to send the BAM file through the simplifying writer, wrap it here + if ( stub.simplifyBAM() ) { + this.writer = new SimplifyingSAMFileWriter(this.writer); + } + } + + public SAMFileHeader getFileHeader() { + return writer.getFileHeader(); + } + + public void addAlignment( SAMRecord read ) { + writer.addAlignment(read); + } + + public void close() { + try { + writer.close(); + } catch (RuntimeIOException e) { + throw new UserException.ErrorWritingBamFile(e.getMessage()); + } + } + + public void mergeInto( SAMFileWriter targetStream ) { + SAMFileReader reader = new SAMFileReader( file ); + try { + CloseableIterator iterator = reader.iterator(); + while( iterator.hasNext() ) + targetStream.addAlignment( iterator.next() ); + iterator.close(); + } + finally { + reader.close(); + file.delete(); + } + } + + private SAMFileWriter createCRAMWriter(final SAMFileWriterFactory factory, + final SAMFileHeader header, + final OutputStream outputStream, + final File referenceFasta) { + return factory.makeCRAMWriter(header, outputStream, referenceFasta); + } + + private SAMFileWriter createBAMWriter(final SAMFileWriterFactory factory, + final SAMFileHeader header, + final boolean presorted, + final File outputFile, + final Integer compressionLevel) { + SAMFileWriter writer; + if(compressionLevel != null) + writer = factory.makeBAMWriter(header, presorted, outputFile, compressionLevel); + else + writer = factory.makeBAMWriter(header, presorted, outputFile); + + // mhanna - 1 Mar 2011 - temporary hack until Picard generates an index file for empty BAMs -- + // - do a pre-initialization of the BAM file. + try { + Method prepareToWriteAlignmentsMethod = writer.getClass().getDeclaredMethod("prepareToWriteAlignments"); + if(prepareToWriteAlignmentsMethod != null) { + prepareToWriteAlignmentsMethod.setAccessible(true); + prepareToWriteAlignmentsMethod.invoke(writer); + } + } + catch(NoSuchMethodException ex) { + logger.info("Unable to call prepareToWriteAlignments method; this should be reviewed when Picard is updated."); + } + catch(IllegalAccessException ex) { + logger.info("Unable to access prepareToWriteAlignments method; this should be reviewed when Picard is updated."); + } + catch(InvocationTargetException ex) { + logger.info("Unable to invoke prepareToWriteAlignments method; this should be reviewed when Picard is updated."); + } + + return writer; + } + + @Override + public void setProgressLogger(final ProgressLoggerInterface logger) { + writer.setProgressLogger(logger); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/Storage.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/Storage.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/Storage.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/Storage.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/StorageFactory.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/StorageFactory.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/StorageFactory.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/StorageFactory.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/VariantContextWriterStorage.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/VariantContextWriterStorage.java new file mode 100644 index 000000000..a54d2ffac --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/VariantContextWriterStorage.java @@ -0,0 +1,228 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.io.storage; + +import htsjdk.samtools.util.BlockCompressedOutputStream; +import org.apache.log4j.Logger; +import htsjdk.tribble.AbstractFeatureReader; +import htsjdk.tribble.Feature; +import htsjdk.tribble.FeatureCodec; +import org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub; +import org.broadinstitute.gatk.utils.refdata.tracks.FeatureManager; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import htsjdk.variant.bcf2.BCF2Utils; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.writer.Options; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; +import htsjdk.variant.vcf.VCFHeader; + +import java.io.*; +import java.util.Arrays; +import java.util.EnumSet; +import java.util.List; + +/** + * Provides temporary and permanent storage for genotypes in VCF format. + * + * @author mhanna + * @version 0.1 + */ +public class VariantContextWriterStorage implements Storage, VariantContextWriter { + /** + * our log, which we want to capture anything from this class + */ + private static Logger logger = Logger.getLogger(VariantContextWriterStorage.class); + + private final static int BUFFER_SIZE = 1048576; + + protected final File file; + protected OutputStream stream; + protected final VariantContextWriter writer; + boolean closed = false; + + /** + * Constructs an object which will write directly into the output file provided by the stub. + * Intentionally delaying the writing of the header -- this should be filled in by the walker. + * + * Respecs the isCompressed() request in stub, so if isCompressed() is true then this + * will create a storage output that dumps output to a BlockCompressedOutputStream. + * + * @param stub Stub to use when constructing the output file. + */ + public VariantContextWriterStorage(VariantContextWriterStub stub) { + if ( stub.getOutputFile() != null ) { + this.file = stub.getOutputFile(); + writer = vcfWriterToFile(stub,stub.getOutputFile(),true,true); + } + else if ( stub.getOutputStream() != null ) { + this.file = null; + this.stream = stub.getOutputStream(); + writer = VariantContextWriterFactory.create(stream, + stub.getMasterSequenceDictionary(), stub.getWriterOptions(false)); + } + else + throw new ReviewedGATKException("Unable to create target to which to write; storage was provided with neither a file nor a stream."); + } + + /** + * Constructs an object which will redirect into a different file. + * + * Note that this function does not respect the isCompressed() request from the stub, in order + * to ensure that tmp. files can be read back in by the Tribble system, and merged with the mergeInto function. + * + * @param stub Stub to use when synthesizing file / header info. + * @param tempFile File into which to direct the output data. + */ + public VariantContextWriterStorage(VariantContextWriterStub stub, File tempFile) { + //logger.debug("Creating temporary output file " + tempFile.getAbsolutePath() + " for VariantContext output."); + this.file = tempFile; + this.writer = vcfWriterToFile(stub, file, false, false); + writer.writeHeader(stub.getVCFHeader()); + } + + /** + * common initialization routine for multiple constructors + * @param stub Stub to use when constructing the output file. + * @param file Target file into which to write VCF records. + * @param indexOnTheFly true to index the file on the fly. NOTE: will be forced to false for compressed files. + * @param allowCompressed if false, we won't compress the output, even if the stub requests it. Critical + * for creating temp. output files that will be subsequently merged, as these do not + * support compressed output + * @return A VCF writer for use with this class + */ + private VariantContextWriter vcfWriterToFile(final VariantContextWriterStub stub, + final File file, + final boolean indexOnTheFly, + final boolean allowCompressed) { + try { + // we cannot merge compressed outputs, so don't compress if allowCompressed is false, + // which is the case when we have a temporary output file for later merging + if ( allowCompressed && stub.isCompressed() ) + stream = new BlockCompressedOutputStream(file); + else + stream = new PrintStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE)); + } + catch(IOException ex) { + throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex); + } + + EnumSet options = stub.getWriterOptions(indexOnTheFly); + VariantContextWriter writer = VariantContextWriterFactory.create(file, this.stream, stub.getMasterSequenceDictionary(), stub.getIndexCreator(), options); + + // if the stub says to test BCF, create a secondary writer to BCF and an 2 way out writer to send to both + // TODO -- remove me when argument generateShadowBCF is removed + if ( stub.alsoWriteBCFForTest() && ! VariantContextWriterFactory.isBCFOutput(file, options)) { + final File bcfFile = BCF2Utils.shadowBCF(file); + if ( bcfFile != null ) { + FileOutputStream bcfStream; + try { + bcfStream = new FileOutputStream(bcfFile); + } catch (FileNotFoundException e) { + throw new RuntimeException(bcfFile + ": Unable to create BCF writer", e); + } + + VariantContextWriter bcfWriter = VariantContextWriterFactory.create(bcfFile, bcfStream, stub.getMasterSequenceDictionary(), stub.getIndexCreator(), options); + writer = new TestWriter(writer, bcfWriter); + } + } + + return writer; + } + + private final static class TestWriter implements VariantContextWriter { + final List writers; + + private TestWriter(final VariantContextWriter ... writers) { + this.writers = Arrays.asList(writers); + } + + @Override + public void writeHeader(final VCFHeader header) { + for ( final VariantContextWriter writer : writers ) writer.writeHeader(header); + } + + @Override + public void close() { + for ( final VariantContextWriter writer : writers ) writer.close(); + } + + @Override + public void add(final VariantContext vc) { + for ( final VariantContextWriter writer : writers ) writer.add(vc); + } + } + + public void add(VariantContext vc) { + if ( closed ) throw new ReviewedGATKException("Attempting to write to a closed VariantContextWriterStorage " + vc.getStart() + " storage=" + this); + writer.add(vc); + } + + /** + * initialize this VCF header + * + * @param header the header + */ + public void writeHeader(VCFHeader header) { + writer.writeHeader(header); + } + + /** + * Close the VCF storage object. + */ + public void close() { + writer.close(); + closed = true; + } + + public void mergeInto(VariantContextWriterStorage target) { + try { + if ( ! closed ) + throw new ReviewedGATKException("Writer not closed, but we are merging into the file!"); + final String targetFilePath = target.file != null ? target.file.getAbsolutePath() : "/dev/stdin"; + logger.debug(String.format("Merging VariantContextWriterStorage from %s into %s", file.getAbsolutePath(), targetFilePath)); + + // use the feature manager to determine the right codec for the tmp file + // that way we don't assume it's a specific type + final FeatureManager.FeatureDescriptor fd = new FeatureManager().getByFiletype(file); + if ( fd == null ) + throw new UserException.LocalParallelizationProblem(file); + + final FeatureCodec codec = fd.getCodec(); + final AbstractFeatureReader source = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), codec, false); + + for ( final Feature vc : source.iterator() ) { + target.writer.add((VariantContext) vc); + } + + source.close(); + file.delete(); // this should be last to aid in debugging when the process fails + } catch (IOException e) { + throw new UserException.CouldNotReadInputFile(file, "Error reading file in VCFWriterStorage: ", e); + } + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/OutputStreamArgumentTypeDescriptor.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/OutputStreamArgumentTypeDescriptor.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/OutputStreamArgumentTypeDescriptor.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/OutputStreamArgumentTypeDescriptor.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/OutputStreamStub.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/OutputStreamStub.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/OutputStreamStub.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/OutputStreamStub.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterArgumentTypeDescriptor.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterArgumentTypeDescriptor.java new file mode 100644 index 000000000..dc4824c08 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterArgumentTypeDescriptor.java @@ -0,0 +1,106 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.io.stubs; + +import htsjdk.samtools.SAMFileWriter; +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.io.OutputStream; +import java.lang.reflect.Type; + +/** + * Insert a SAMFileWriterStub instead of a full-fledged concrete OutputStream implementations. + */ +public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { + + /** + * The engine into which output stubs should be fed. + */ + private final GenomeAnalysisEngine engine; + + /** + * The default location to which data should be written if the user specifies no such location. + */ + private final OutputStream defaultOutputStream; + + /** + * Create a new SAMFileWriter argument, notifying the given engine when that argument has been created. + * @param engine Engine to add SAMFileWriter output to. + * @param defaultOutputStream the target for the data + */ + public SAMFileWriterArgumentTypeDescriptor( GenomeAnalysisEngine engine, OutputStream defaultOutputStream ) { + this.engine = engine; + this.defaultOutputStream = defaultOutputStream; + } + + @Override + public boolean supports( Class type ) { + return SAMFileWriter.class.equals(type) || GATKSAMFileWriter.class.equals(type); + } + + @Override + public boolean createsTypeDefault(ArgumentSource source) { + return !source.isRequired() && source.defaultsToStdout(); + } + + @Override + public String typeDefaultDocString(ArgumentSource source) { + return "stdout"; + } + + @Override + public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { + if(source.isRequired() || !source.defaultsToStdout()) + throw new ReviewedGATKException("BUG: tried to create type default for argument type descriptor that can't support a type default."); + SAMFileWriterStub stub = new SAMFileWriterStub(engine,defaultOutputStream); + engine.addOutput(stub); + return stub; + } + + @Override + public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ) { + // Extract all possible parameters that could be passed to a BAM file writer? + ArgumentDefinition bamArgumentDefinition = createDefaultArgumentDefinition(source); + ArgumentMatchValue writerFileName = getArgumentValue( bamArgumentDefinition, matches ); + + // Create the stub + SAMFileWriterStub stub = null; // stub = new SAMFileWriterStub(engine, defaultOutputStream); + + if (writerFileName != null && writerFileName.asFile() != null ) { + stub = new SAMFileWriterStub(engine, writerFileName.asFile()); + + // WARNING: Side effects required by engine! + parsingEngine.addTags(stub,getArgumentTags(matches)); + engine.addOutput(stub); + } + + return stub; + } + +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterStub.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterStub.java new file mode 100644 index 000000000..50435eb53 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterStub.java @@ -0,0 +1,373 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.io.stubs; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMFileWriter; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.util.ProgressLoggerInterface; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; +import org.broadinstitute.gatk.engine.io.OutputTracker; +import org.broadinstitute.gatk.utils.io.ReferenceBacked; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.utils.baq.BAQ; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.io.File; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.List; + +/** + * A stub for routing and management of SAM file reading and writing. + * + * @author mhanna + * @version 0.1 + */ +public class SAMFileWriterStub implements Stub, GATKSAMFileWriter, ReferenceBacked { + /** + * Engine to use for collecting attributes for the output SAM file. + */ + private final GenomeAnalysisEngine engine; + + /** + * A header supplied by the user that overrides the merged header from the input BAM. + */ + private SAMFileHeader headerOverride = null; + + /** + * The sam file that this stub should write to. Should be passed along to + * whatever happens to create the StreamConnector. + */ + private final File samFile; + + /** + * The reference file for stub. + */ + private File referenceFile; + + /** + * The target output stream, to be used in place of the SAM file. + */ + private final OutputStream samOutputStream; + + /** + * The validation stringency to apply when reading this file. + */ + private Integer compressionLevel = null; + + /** + * Should the GATK index the output BAM on-the-fly? + */ + private boolean indexOnTheFly = false; + + /** + * Should the GATK generate an md5 for the output BAM? + */ + private boolean generateMD5 = false; + + /** + * Should this BAM be presorted? + */ + private boolean presorted = true; + + /** + * How many records should the BAM writer store in RAM while + * sorting the BAM on-the-fly? + */ + private Integer maxRecordsInRam = null; + + /** + * Connects this stub with an external stream capable of serving the + * requests of the consumer of this stub. + */ + private OutputTracker outputTracker = null; + + /** + * Has the write started? If so, throw an exception if someone tries to + * change write parameters to the file (compression level, presorted flag, + * header, etc). + */ + private boolean writeStarted = false; + + + /** + * HMM for BAQ, if needed + */ + BAQ baqHMM = new BAQ(); + + /** + * Should we simplify the BAM file while writing it out? + */ + private boolean simplifyBAM = false; + + private List onOutputReadTransformers = null; + + /** + * Create a new stub given the requested SAM file and compression level. + * @param engine source of header data, maybe other data about input files. + * @param samFile SAM file to (ultimately) create. + */ + public SAMFileWriterStub( GenomeAnalysisEngine engine, File samFile ) { + this(engine, samFile, null); + } + + /** + * Create a new stub given the requested SAM file and compression level. + * @param engine source of header data, maybe other data about input files. + * @param stream Output stream to which data should be written. + */ + public SAMFileWriterStub( GenomeAnalysisEngine engine, OutputStream stream ) { + this(engine, null, stream); + } + + private SAMFileWriterStub(final GenomeAnalysisEngine engine, final File samFile, final OutputStream stream) { + this.engine = engine; + this.samFile = samFile; + this.samOutputStream = stream; + } + + /** + * Creates a SAMFileWriter using all of the features currently set in the engine (command line arguments, ReadTransformers, etc) + * @param file the filename to write to + * @param engine the engine + * @return a SAMFileWriter with the correct options set + */ + public static SAMFileWriter createSAMFileWriter(final String file, final GenomeAnalysisEngine engine) { + final SAMFileWriterStub output = new SAMFileWriterStub(engine, new File(file)); + output.processArguments(engine.getArguments()); + return output; + } + + /** + * As {@link #createSAMFileWriter(String, org.broadinstitute.gatk.engine.GenomeAnalysisEngine)}, but also sets the header + */ + public static SAMFileWriter createSAMFileWriter(final String file, final GenomeAnalysisEngine engine, final SAMFileHeader header) { + final SAMFileWriterStub output = (SAMFileWriterStub) createSAMFileWriter(file, engine); + output.writeHeader(header); + return output; + } + + /** + * Retrieves the SAM file to (ultimately) be created. + * @return The SAM file. Must not be null. + */ + public File getOutputFile() { + return samFile; + } + + public boolean simplifyBAM() { + return simplifyBAM; + } + + public void setSimplifyBAM(boolean v) { + simplifyBAM = v; + } + + public OutputStream getOutputStream() { + return samOutputStream; + } + + @Override + public File getReferenceFile() { + return referenceFile; + } + + @Override + public void setReferenceFile(final File referenceFile) { + this.referenceFile = referenceFile; + } + + /** + * Retrieves the header to use when creating the new SAM file. + * @return header to use when creating the new SAM file. + */ + public SAMFileHeader getFileHeader() { + return headerOverride != null ? headerOverride : engine.getSAMFileHeader(); + } + + /** + * Retrieves the desired compression level for + * @return The current compression level. Could be null if the user doesn't care. + */ + public Integer getCompressionLevel() { + return compressionLevel; + } + + /** + * Sets the desired compression level. + * @param compressionLevel The suggested compression level. + */ + public void setCompressionLevel( Integer compressionLevel ) { + if(writeStarted) + throw new ReviewedGATKException("Attempted to change the compression level of a file with alignments already in it."); + this.compressionLevel = compressionLevel; + } + + /** + * Gets whether to index this output stream on-the-fly. + * @return True means create an index. False means skip index creation. + */ + public Boolean getIndexOnTheFly() { + return indexOnTheFly; + } + + /** + * Controls whether to index this output stream on-the-fly. + * @param indexOnTheFly True means create an index. False means skip index creation. + */ + public void setIndexOnTheFly( boolean indexOnTheFly ) { + if(writeStarted) + throw new UserException("Attempted to index a BAM on the fly of a file with alignments already in it."); + this.indexOnTheFly = indexOnTheFly; + } + + /** + * Gets whether to generate an md5 on-the-fly for this BAM. + * @return True generates the md5. False means skip writing the file. + */ + public Boolean getGenerateMD5() { + return generateMD5; + } + + /** + * Gets whether to generate an md5 on-the-fly for this BAM. + * @param generateMD5 True generates the md5. False means skip writing the file. + */ + public void setGenerateMD5(boolean generateMD5) { + if(writeStarted) + throw new UserException("Attempted to turn on md5 generation for BAM file with alignments already in it."); + this.generateMD5 = generateMD5; + } + + /** + * Whether the BAM file to create is actually presorted. + * @return True if the BAM file is presorted. False otherwise. + */ + public boolean isPresorted() { + return this.presorted; + } + + /** + * Set Whether the BAM file to create is actually presorted. + * @param presorted True if the BAM file is presorted. False otherwise. + */ + public void setPresorted(boolean presorted) { + if(writeStarted) + throw new ReviewedGATKException("Attempted to change the presorted state of a file with alignments already in it."); + this.presorted = presorted; + } + + /** + * Get the maximum number of reads to hold in RAM when sorting a BAM on-the-fly. + * @return Max records in RAM, or null if unset. + */ + public Integer getMaxRecordsInRam() { + return this.maxRecordsInRam; + } + + /** + * Sets the maximum number of reads to hold in RAM when sorting a BAM on-the-fly. + * @param maxRecordsInRam Max number of records in RAM. + */ + public void setMaxRecordsInRam(int maxRecordsInRam) { + if(writeStarted) + throw new ReviewedGATKException("Attempted to change the max records in RAM of a file with alignments already in it."); + this.maxRecordsInRam = maxRecordsInRam; + } + + /** + * Registers the given streamConnector with this stub. + * @param outputTracker The connector used to provide an appropriate stream. + */ + public void register( OutputTracker outputTracker ) { + this.outputTracker = outputTracker; + } + + @Override + public void processArguments( final GATKArgumentCollection argumentCollection ) { + if (argumentCollection.bamCompression != null) + setCompressionLevel(argumentCollection.bamCompression); + setGenerateMD5(argumentCollection.enableBAMmd5); + setIndexOnTheFly(!argumentCollection.disableBAMIndexing); + setSimplifyBAM(argumentCollection.simplifyBAM); + + } + + /** + * Use the given header as the target for this writer. + * @param header The header to write. + */ + public void writeHeader(SAMFileHeader header) { + if(writeStarted) + throw new ReviewedGATKException("Attempted to change the header of a file with alignments already in it."); + this.headerOverride = header; + } + + private void initializeReadTransformers() { + this.onOutputReadTransformers = new ArrayList<>(engine.getReadTransformers().size()); + for ( final ReadTransformer transformer : engine.getReadTransformers() ) { + if ( transformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_OUTPUT ) + onOutputReadTransformers.add(transformer); + } + } + + /** + * @{inheritDoc} + */ + public void addAlignment( final SAMRecord readIn ) { + if ( onOutputReadTransformers == null ) + initializeReadTransformers(); + + GATKSAMRecord workingRead = (GATKSAMRecord)readIn; + + // run on output read transformers + for ( final ReadTransformer transform : onOutputReadTransformers ) + workingRead = transform.apply(workingRead); + + writeStarted = true; + outputTracker.getStorage(this).addAlignment(workingRead); + } + + /** + * @{inheritDoc} + */ + public void close() { + outputTracker.getStorage(this).close(); + } + + /** + * @throws java.lang.UnsupportedOperationException No progress logging in this implementation. + */ + @Override + public void setProgressLogger(final ProgressLoggerInterface logger) { + throw new UnsupportedOperationException("Progress logging not supported"); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMReaderArgumentTypeDescriptor.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMReaderArgumentTypeDescriptor.java new file mode 100644 index 000000000..3a440488d --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMReaderArgumentTypeDescriptor.java @@ -0,0 +1,77 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.io.stubs; + +import htsjdk.samtools.SAMFileReader; +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.SAMReaderBuilder; + +import java.lang.reflect.Type; + +/** + * Describe how to parse SAMReaders. + */ +public class SAMReaderArgumentTypeDescriptor extends ArgumentTypeDescriptor { + /** + * The engine into which output stubs should be fed. + */ + private GenomeAnalysisEngine engine; + + /** + * Create a new SAMFileReader argument, notifying the given engine when that argument has been created. + * @param engine engine + */ + public SAMReaderArgumentTypeDescriptor(GenomeAnalysisEngine engine) { + this.engine = engine; + } + + @Override + public boolean supports( Class type ) { + return SAMFileReader.class.isAssignableFrom(type); + } + + @Override + public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ) { + SAMReaderBuilder builder = new SAMReaderBuilder(); + + ArgumentMatchValue readerFileName = getArgumentValue( createDefaultArgumentDefinition(source), matches ); + + if( readerFileName == null ) + throw new UserException.CommandLineException("SAM file compression was supplied, but no associated writer was supplied with it."); + + builder.setSAMFile(readerFileName.asFile()); + + // WARNING: Skipping required side-effect because stub is impossible to generate. + engine.addInput(source, builder); + + // MASSIVE KLUDGE! SAMFileReader is tricky to implement and we don't yet have a stub. Return null, then + // let the output tracker load it in. + // TODO: Add a stub for SAMReader. + return null; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/Stub.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/Stub.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/Stub.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/Stub.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VCFWriterArgumentTypeDescriptor.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VCFWriterArgumentTypeDescriptor.java new file mode 100644 index 000000000..68163850d --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VCFWriterArgumentTypeDescriptor.java @@ -0,0 +1,138 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.io.stubs; + +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.io.File; +import java.io.OutputStream; +import java.lang.reflect.Type; +import java.util.Collection; + +/** + * Injects new command-line arguments into the system providing support for the genotype writer. + * + * @author mhanna + * @version 0.1 + */ +public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { + + /** + * The engine into which output stubs should be fed. + */ + private final GenomeAnalysisEngine engine; + + /** + * The default location to which data should be written if the user specifies no such location. + */ + private final OutputStream defaultOutputStream; + + /** + * The sources into which arguments were injected. + */ + private final Collection argumentSources; + + /** + * Create a new GenotypeWriter argument, notifying the given engine when that argument has been created. + * @param engine the engine to be notified. + * @param defaultOutputStream the default output stream to be written to if nothing else is specified. + * @param argumentSources sources from which command-line arguments should be derived. + */ + public VCFWriterArgumentTypeDescriptor(GenomeAnalysisEngine engine, OutputStream defaultOutputStream, Collection argumentSources) { + this.engine = engine; + this.defaultOutputStream = defaultOutputStream; + this.argumentSources = argumentSources; + } + + /** + * Reports whether this ArgumentTypeDescriptor supports the given type. + * @param type The type to check. + * @return True if the argument is a GenotypeWriter. + */ + @Override + public boolean supports( Class type ) { + return VariantContextWriter.class.equals(type); + } + + /** + * This command-line argument descriptor does want to override the provided default value. + * @return true always. + */ + @Override + public boolean createsTypeDefault(ArgumentSource source) { + return !source.isRequired() && source.defaultsToStdout(); + } + + @Override + public String typeDefaultDocString(ArgumentSource source) { + return "stdout"; + } + + @Override + public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) { + if(source.isRequired() || !source.defaultsToStdout()) + throw new ReviewedGATKException("BUG: tried to create type default for argument type descriptor that can't support a type default."); + VariantContextWriterStub stub = new VariantContextWriterStub(engine, defaultOutputStream, argumentSources); + engine.addOutput(stub); + return stub; + } + + /** + * Convert the given argument matches into a single object suitable for feeding into the ArgumentSource. + * @param source Source for this argument. + * @param type not used + * @param matches Matches that match with this argument. + * @return Transform from the matches into the associated argument. + */ + @Override + public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ) { + ArgumentDefinition defaultArgumentDefinition = createDefaultArgumentDefinition(source); + // Get the filename for the genotype file, if it exists. If not, we'll need to send output to out. + ArgumentMatchValue writerFileName = getArgumentValue(defaultArgumentDefinition,matches); + File writerFile = writerFileName != null ? writerFileName.asFile() : null; + + // This parser has been passed a null filename and the GATK is not responsible for creating a type default for the object; + // therefore, the user must have failed to specify a type default + if(writerFile == null && source.isRequired()) + throw new MissingArgumentValueException(defaultArgumentDefinition); + + // Create a stub for the given object. + final VariantContextWriterStub stub = (writerFile != null) + ? new VariantContextWriterStub(engine, writerFile, argumentSources) + : new VariantContextWriterStub(engine, defaultOutputStream, argumentSources); + + stub.setCompressed(isCompressed(writerFileName == null ? null: writerFileName.asString())); + + // WARNING: Side effects required by engine! + parsingEngine.addTags(stub,getArgumentTags(matches)); + engine.addOutput(stub); + + return stub; + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VariantContextWriterStub.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VariantContextWriterStub.java new file mode 100644 index 000000000..548bee887 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VariantContextWriterStub.java @@ -0,0 +1,303 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.io.stubs; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.tribble.index.IndexCreator; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; +import org.broadinstitute.gatk.engine.io.OutputTracker; +import org.broadinstitute.gatk.engine.GATKVCFUtils; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.writer.Options; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; + +import java.io.File; +import java.io.OutputStream; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Collection; +import java.util.EnumSet; +import java.util.List; + +/** + * A stub for routing and management of genotype reading and writing. + * + * @author ebanks + * @version 0.1 + */ +public class VariantContextWriterStub implements Stub, VariantContextWriter { + public final static boolean UPDATE_CONTIG_HEADERS = true; + + /** + * The engine, central to the GATK's processing. + */ + private final GenomeAnalysisEngine engine; + + /** + * The file that this stub should write to. Should be mutually + * exclusive with genotypeStream. + */ + private final File genotypeFile; + + /** + * The output stream to which stub data should be written. Will be + * mutually exclusive with genotypeFile. + */ + private final PrintStream genotypeStream; + + /** + * A hack: push the argument sources into the VCF header so that the VCF header + * can rebuild the command-line arguments. + */ + private final Collection argumentSources; + + /** + * Which IndexCreator to use + */ + private final IndexCreator indexCreator; + + /** + * The cached VCF header (initialized to null) + */ + private VCFHeader vcfHeader = null; + + /** + * Should we emit a compressed output stream? + */ + private boolean isCompressed = false; + + /** + * Should the header be written out? A hidden argument. + */ + private boolean skipWritingCommandLineHeader = false; + + /** + * Should we not write genotypes even when provided? + */ + private boolean doNotWriteGenotypes = false; + + /** + * Should we force BCF writing regardless of the file extension? + */ + private boolean forceBCF = false; + + /** + * Should we write all of the fields in the FORMAT field, even if missing fields could be trimmed? + */ + private boolean writeFullFormatField = false; + + /** + * Connects this stub with an external stream capable of serving the + * requests of the consumer of this stub. + */ + protected OutputTracker outputTracker = null; + + /** + * Create a new stub given the requested file. + * + * @param engine engine. + * @param genotypeFile file to (ultimately) create. + * @param argumentSources sources. + */ + public VariantContextWriterStub(GenomeAnalysisEngine engine, File genotypeFile, Collection argumentSources) { + this.engine = engine; + this.genotypeFile = genotypeFile; + this.genotypeStream = null; + + this.indexCreator = GATKVCFUtils.makeIndexCreator(engine.getArguments().variant_index_type, engine.getArguments().variant_index_parameter, + genotypeFile, null); + this.argumentSources = argumentSources; + } + + /** + * Create a new stub given the requested file. + * + * @param engine engine. + * @param genotypeStream stream to (ultimately) write. + * @param argumentSources sources. + */ + public VariantContextWriterStub(GenomeAnalysisEngine engine, OutputStream genotypeStream, Collection argumentSources) { + this.engine = engine; + this.genotypeFile = null; + this.genotypeStream = new PrintStream(genotypeStream); + this.indexCreator = null; + this.argumentSources = argumentSources; + } + + /** + * Retrieves the file to (ultimately) be created. + * @return The file. Can be null if genotypeStream is not. + */ + public File getOutputFile() { + return genotypeFile; + } + + /** + * Retrieves the output stream to which to (ultimately) write. + * @return The file. Can be null if genotypeFile is not. + */ + public OutputStream getOutputStream() { + return genotypeStream; + } + + public boolean isCompressed() { + return isCompressed; + } + + public void setCompressed(final boolean compressed) { + isCompressed = compressed; + } + + public void setSkipWritingCommandLineHeader(final boolean skipWritingCommandLineHeader) { + this.skipWritingCommandLineHeader = skipWritingCommandLineHeader; + } + + public void setDoNotWriteGenotypes(final boolean doNotWriteGenotypes) { + this.doNotWriteGenotypes = doNotWriteGenotypes; + } + + public void setForceBCF(final boolean forceBCF) { + this.forceBCF = forceBCF; + } + + public void setWriteFullFormatField(final boolean writeFullFormatField) { + this.writeFullFormatField = writeFullFormatField; + } + + public IndexCreator getIndexCreator() { + return indexCreator; + } + + /** + * Gets the master sequence dictionary from the engine associated with this stub + * @link GenomeAnalysisEngine.getMasterSequenceDictionary + * @return the master sequence dictionary from the engine associated with this stub + */ + public SAMSequenceDictionary getMasterSequenceDictionary() { + return engine.getMasterSequenceDictionary(); + } + + public EnumSet getWriterOptions() { + return getWriterOptions(false); + } + + public EnumSet getWriterOptions(boolean indexOnTheFly) { + final List options = new ArrayList<>(); + + if ( doNotWriteGenotypes ) options.add(Options.DO_NOT_WRITE_GENOTYPES); + if ( engine.lenientVCFProcessing() ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER); + if ( indexOnTheFly) options.add(Options.INDEX_ON_THE_FLY); + if ( writeFullFormatField ) options.add(Options.WRITE_FULL_FORMAT_FIELD); + + if ( forceBCF || (getOutputFile() != null && VariantContextWriterFactory.isBCFOutput(getOutputFile())) ) + options.add(Options.FORCE_BCF); + + return options.isEmpty() ? EnumSet.noneOf(Options.class) : EnumSet.copyOf(options); + } + + /** + * Retrieves the header to use when creating the new file. + * @return header to use when creating the new file. + */ + public VCFHeader getVCFHeader() { + return vcfHeader; + } + + /** + * Registers the given streamConnector with this stub. + * @param outputTracker The connector used to provide an appropriate stream. + */ + public void register( OutputTracker outputTracker ) { + this.outputTracker = outputTracker; + } + + @Override + public void processArguments( final GATKArgumentCollection argumentCollection ) { + setDoNotWriteGenotypes(argumentCollection.sitesOnlyVCF); + setSkipWritingCommandLineHeader(argumentCollection.disableCommandLineInVCF); + setForceBCF(argumentCollection.forceBCFOutput); + setWriteFullFormatField(argumentCollection.neverTrimVCFFormatField); + } + + public void writeHeader(VCFHeader header) { + vcfHeader = header; + + if ( header.isWriteEngineHeaders() ) { + // skip writing the command line header if requested + if ( ! skipWritingCommandLineHeader && header.isWriteCommandLine() ) { + // Always add the header line, as the current format allows multiple entries + final VCFHeaderLine commandLineArgHeaderLine = GATKVCFUtils.getCommandLineArgumentHeaderLine(vcfHeader, engine, argumentSources); + vcfHeader.addMetaDataLine(commandLineArgHeaderLine); + } + + if ( UPDATE_CONTIG_HEADERS ) + vcfHeader = GATKVCFUtils.withUpdatedContigs(vcfHeader, engine); + } + + outputTracker.getStorage(this).writeHeader(vcfHeader); + } + + /** + * @{inheritDoc} + */ + public void add(VariantContext vc) { + outputTracker.getStorage(this).add(vc); + } + + /** + * @{inheritDoc} + */ + public void close() { + outputTracker.getStorage(this).close(); + } + + /** + * Gets a string representation of this object. + * @return a string representation of this object. + */ + @Override + public String toString() { + return (getOutputFile() == null) ? "(Stream)" : getOutputFile().getAbsolutePath(); + } + + /** + * Should we also write a BCF file alongside our VCF file for testing + * + * TODO -- remove me when argument generateShadowBCF is removed + * + * @return + */ + public boolean alsoWriteBCFForTest() { + return engine.getArguments().numberOfDataThreads == 1 && // only works single threaded + ! isCompressed() && // for non-compressed outputs + getOutputFile() != null && // that are going to disk + engine.getArguments().generateShadowBCF; // and we actually want to do it + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIterator.java new file mode 100644 index 000000000..ecce811f9 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIterator.java @@ -0,0 +1,160 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + +import htsjdk.samtools.MergingSamRecordIterator; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; + +import java.util.Iterator; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + + +/** + * @author aaron + * @version 1.0 + * @date Apr 14, 2009 + *

    + * Class BoundedReadIterator + *

    + * This class implements a read iterator that is bounded by the number of reads + * it will produce over the iteration. + */ +public class BoundedReadIterator implements GATKSAMIterator { + + // the genome loc we're bounding + final private long readCount; + private long currentCount = 0; + + // the iterator we want to decorate + private final GATKSAMIterator iterator; + + // our unmapped read flag + private boolean doNotUseThatUnmappedReadPile = false; + + /** + * The next read that we've buffered. Null indicates that there's + * nothing in the buffer (not that there isn't a next read). + */ + private SAMRecord record = null; + + /** + * constructor + * @param iter + * @param readCount + */ + public BoundedReadIterator(GATKSAMIterator iter, long readCount) { + this.iterator = iter; + this.readCount = readCount; + } + + public void useUnmappedReads(boolean useThem) { + this.doNotUseThatUnmappedReadPile = useThem; + } + + public SAMFileHeader getHeader() { + // todo: this is bad, we need an iterface out there for samrecords that supports getting the header, + // regardless of the merging + if (iterator instanceof MergingSamRecordIterator) + return ((MergingSamRecordIterator)iterator).getMergedHeader(); + else + return null; + } + + /** + * Do we have a next? If the iterator has a read and we're not over the read + * count, then yes + * @return + */ + public boolean hasNext() { + if( record != null ) + return true; + + if (iterator.hasNext() && currentCount < readCount) { + record = iterator.next(); + ++currentCount; + if (record.getAlignmentStart() == 0 && doNotUseThatUnmappedReadPile) { + return false; + } + return true; + } else { + return false; + } + } + + /** + * get the next SAMRecord + * @return SAMRecord representing the next read + */ + public SAMRecord next() { + SAMRecord cached = record; + record = null; + return cached; + } + + /** + * this is unsupported on SAMRecord iterators + */ + public void remove() { + throw new UnsupportedOperationException("You cannot use an iterator to remove a SAMRecord"); + } + + /** + * close the iterator + */ + public void close() { + iterator.close(); + } + + public Iterator iterator() { + return this; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GenomeLocusIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/GenomeLocusIterator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GenomeLocusIterator.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/GenomeLocusIterator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/IterableIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/IterableIterator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/IterableIterator.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/IterableIterator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/MalformedBAMErrorReformatingIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/MalformedBAMErrorReformatingIterator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/MalformedBAMErrorReformatingIterator.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/MalformedBAMErrorReformatingIterator.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/MisencodedBaseQualityReadTransformer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/MisencodedBaseQualityReadTransformer.java new file mode 100644 index 000000000..ea2e081c7 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/MisencodedBaseQualityReadTransformer.java @@ -0,0 +1,94 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +/** + * Checks for and errors out (or fixes if requested) when it detects reads with base qualities that are not encoded with + * phred-scaled quality scores. Q0 == ASCII 33 according to the SAM specification, whereas Illumina encoding starts at + * Q64. The idea here is simple: if we are asked to fix the scores then we just subtract 31 from every quality score. + * Otherwise, we randomly sample reads (for efficiency) and error out if we encounter a qual that's too high. + */ +public class MisencodedBaseQualityReadTransformer extends ReadTransformer { + + private static final int samplingFrequency = 1000; // sample 1 read for every 1000 encountered + private static final int encodingFixValue = 31; // Illumina_64 - PHRED_33 + + private boolean disabled; + private boolean fixQuals; + protected static int currentReadCounter = 0; + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { + fixQuals = engine.getArguments().FIX_MISENCODED_QUALS; + disabled = !fixQuals && engine.getArguments().ALLOW_POTENTIALLY_MISENCODED_QUALS; + + return ReadTransformer.ApplicationTime.ON_INPUT; + } + + @Override + public boolean enabled() { + return !disabled; + } + + @Override + public GATKSAMRecord apply(final GATKSAMRecord read) { + if ( fixQuals ) + return fixMisencodedQuals(read); + + checkForMisencodedQuals(read); + return read; + } + + protected static GATKSAMRecord fixMisencodedQuals(final GATKSAMRecord read) { + final byte[] quals = read.getBaseQualities(); + for ( int i = 0; i < quals.length; i++ ) { + quals[i] -= encodingFixValue; + if ( quals[i] < 0 ) + throw new UserException.BadInput("while fixing mis-encoded base qualities we encountered a read that was correctly encoded; we cannot handle such a mixture of reads so unfortunately the BAM must be fixed with some other tool"); + } + read.setBaseQualities(quals); + return read; + } + + protected static void checkForMisencodedQuals(final GATKSAMRecord read) { + // sample reads randomly for checking + if ( ++currentReadCounter >= samplingFrequency ) { + currentReadCounter = 0; + + final byte[] quals = read.getBaseQualities(); + for ( final byte qual : quals ) { + if ( qual > QualityUtils.MAX_REASONABLE_Q_SCORE ) + throw new UserException.MisencodedBAM(read, "we encountered an extremely high quality score of " + (int)qual); + } + } + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/NullSAMIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/NullSAMIterator.java new file mode 100644 index 000000000..ca53fcf1d --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/NullSAMIterator.java @@ -0,0 +1,58 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; + +import java.util.Iterator; +import java.util.NoSuchElementException; +/** + * User: hanna + * Date: May 19, 2009 + * Time: 6:47:16 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * A placeholder for an iterator with no data. + */ +public class NullSAMIterator implements GATKSAMIterator { + public NullSAMIterator() {} + + public Iterator iterator() { return this; } + public void close() { /* NO-OP */ } + + public boolean hasNext() { return false; } + public SAMRecord next() { throw new NoSuchElementException("No next element is available."); } + public void remove() { throw new UnsupportedOperationException("Cannot remove from a GATKSAMIterator"); } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/PeekingIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/PeekingIterator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/PeekingIterator.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/PeekingIterator.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/PositionTrackingIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/PositionTrackingIterator.java new file mode 100644 index 000000000..a79d592f7 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/PositionTrackingIterator.java @@ -0,0 +1,106 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.util.CloseableIterator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; + +/** + * Iterates through a list of elements, tracking the number of elements it has seen. + * @author hanna + * @version 0.1 + */ +public class PositionTrackingIterator implements GATKSAMIterator { + /** + * The iterator being tracked. + */ + private CloseableIterator iterator; + + /** + * Current position within the tracked iterator. + */ + private long position; + + /** + * Retrieves the current position of the iterator. The 'current position' of the iterator is defined as + * the coordinate of the read that will be returned if next() is called. + * @return The current position of the iterator. + */ + public long getPosition() { + return position; + } + + /** + * Create a new iterator wrapping the given position, assuming that the reader is position reads + * into the sequence. + * @param iterator Iterator to wraps. + * @param position Non-negative position where the iterator currently sits. + */ + public PositionTrackingIterator(CloseableIterator iterator, long position ) { + this.iterator = iterator; + this.position = position; + } + + /** + * {@inheritDoc} + */ + public boolean hasNext() { + return iterator.hasNext(); + } + + /** + * Try to get the next read in the list. If a next read is available, increment the position. + * @return next read in the list, if available. + */ + public SAMRecord next() { + try { + return iterator.next(); + } + finally { + position++; + } + } + + /** + * {@inheritDoc} + */ + public GATKSAMIterator iterator() { + return this; + } + + /** + * {@inheritDoc} + */ + public void close() { + iterator.close(); + } + + /** + * {@inheritDoc} + */ + public void remove() { throw new UnsupportedOperationException("Cannot remove from a GATKSAMIterator"); } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/RNAReadTransformer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/RNAReadTransformer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/RNAReadTransformer.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/RNAReadTransformer.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIterator.java new file mode 100644 index 000000000..7a3ca935f --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIterator.java @@ -0,0 +1,141 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + +import htsjdk.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.sam.AlignmentUtils; + +/** + * An iterator which does post-processing of a read, including potentially wrapping + * the read in something with a compatible interface or replacing the read entirely. + * + * @author mhanna + * @version 0.1 + */ +public class ReadFormattingIterator implements GATKSAMIterator { + /** + * Logger. + */ + final protected static Logger logger = Logger.getLogger(ReadFormattingIterator.class); + + /** + * Iterator to which to pass + */ + private GATKSAMIterator wrappedIterator; + + /** + * True if original base qualities should be used. + */ + private final boolean useOriginalBaseQualities; + + /** + * Positive if there is a default Base Quality value to fill in the reads with. + */ + private final byte defaultBaseQualities; + + + /** + * Decorate the given iterator inside a ReadWrappingIterator. + * @param wrappedIterator iterator + * @param useOriginalBaseQualities true if original base qualities should be used + * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. + */ + public ReadFormattingIterator(GATKSAMIterator wrappedIterator, boolean useOriginalBaseQualities, byte defaultBaseQualities) { + this.wrappedIterator = wrappedIterator; + this.useOriginalBaseQualities = useOriginalBaseQualities; + this.defaultBaseQualities = defaultBaseQualities; + + } + + /** + * Convenience function for use in foreach loops. Dangerous because it does not actually + * reset the iterator. + * @return An iterator through the current data stream. + */ + public GATKSAMIterator iterator() { + // NOTE: this iterator doesn't perform any kind of reset operation; it just returns itself. + // can we do something better? Do we really have to provide support for the Iterable interface? + return this; + } + + /** + * Close this iterator. + */ + public void close() { + wrappedIterator.close(); + } + + /** + * Does the iterator contain more values? + * @return True if there are more left to return, false otherwise. + */ + public boolean hasNext() { + return wrappedIterator.hasNext(); + } + + /** + * Get the next value in the sequence. + * @return Next value in the sequence. By convention, a NoSuchElementException should be thrown if + * no next exists. + */ + public SAMRecord next() { + SAMRecord rec = wrappedIterator.next(); + + // Always consolidate the cigar string into canonical form, collapsing zero-length / repeated cigar elements. + // Downstream code (like LocusIteratorByState) cannot necessarily handle non-consolidated cigar strings. + rec.setCigar(AlignmentUtils.consolidateCigar(rec.getCigar())); + + // if we are using default quals, check if we need them, and add if necessary. + // 1. we need if reads are lacking or have incomplete quality scores + // 2. we add if defaultBaseQualities has a positive value + if (defaultBaseQualities >= 0) { + byte reads [] = rec.getReadBases(); + byte quals [] = rec.getBaseQualities(); + if (quals == null || quals.length < reads.length) { + byte new_quals [] = new byte [reads.length]; + for (int i=0; i iterator() { return this; } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/VerifyingSamIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/VerifyingSamIterator.java new file mode 100644 index 000000000..8721779bf --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/VerifyingSamIterator.java @@ -0,0 +1,90 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; + +import java.util.Iterator; + +/** + * Verifies that the incoming stream of reads is correctly sorted + */ +public class VerifyingSamIterator implements GATKSAMIterator { + GATKSAMIterator it; + SAMRecord last = null; + boolean checkOrderP = true; + + public VerifyingSamIterator(GATKSAMIterator it) { + this.it = it; + } + + public boolean hasNext() { return this.it.hasNext(); } + public SAMRecord next() { + + SAMRecord cur = it.next(); + if ( last != null ) + verifyRecord(last, cur); + if ( ! cur.getReadUnmappedFlag() ) + last = cur; + return cur; + } + + private void verifyRecord( final SAMRecord last, final SAMRecord cur ) { + if ( checkOrderP && isOutOfOrder(last, cur) ) { + this.last = null; + throw new UserException.MissortedBAM(String.format("reads are out of order:%nlast:%n%s%ncurrent:%n%s%n", last.format(), cur.format()) ); + } + } + + private boolean isOutOfOrder( final SAMRecord last, final SAMRecord cur ) { + if ( last == null || cur.getReadUnmappedFlag() ) + return false; + else { + if(last.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX || last.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) + throw new UserException.MalformedBAM(last,String.format("read %s has inconsistent mapping information.",last.format())); + if(cur.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX || cur.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) + throw new UserException.MalformedBAM(last,String.format("read %s has inconsistent mapping information.",cur.format())); + + return (last.getReferenceIndex() > cur.getReferenceIndex()) || + (last.getReferenceIndex().equals(cur.getReferenceIndex()) && + last.getAlignmentStart() > cur.getAlignmentStart()); + } + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + public void close() { + it.close(); + } + + public Iterator iterator() { + return this; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/package-info.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReport.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReport.java new file mode 100644 index 000000000..3bd174442 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReport.java @@ -0,0 +1,786 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.phonehome; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.engine.crypt.CryptUtils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.io.IOUtils; +import org.broadinstitute.gatk.utils.io.Resource; +import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor; +import org.jets3t.service.S3Service; +import org.jets3t.service.S3ServiceException; +import org.jets3t.service.impl.rest.httpclient.RestS3Service; +import org.jets3t.service.model.S3Object; +import org.jets3t.service.security.AWSCredentials; +import org.simpleframework.xml.Element; +import org.simpleframework.xml.Serializer; +import org.simpleframework.xml.core.Persister; + +import java.io.*; +import java.security.NoSuchAlgorithmException; +import java.security.PublicKey; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Arrays; +import java.util.Date; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + + +/** + * A detailed description of a GATK run, and error if applicable. Simply create a GATKRunReport + * with the constructor, providing the walker that was run and the fully instantiated GenomeAnalysisEngine + * after the run finishes and the GATKRunReport will collect all of the report information + * into this object. Call postReport to write out the report, as an XML document, to either STDOUT, + * a file (in which case the output is gzipped), or with no arguments the report will be posted to the + * GATK run report database. + * + * @author depristo + * @since 2010 + */ +public class GATKRunReport { + protected static final String REPORT_BUCKET_NAME = "broad.gsa.gatk.run.reports"; + protected static final String TEST_REPORT_BUCKET_NAME = "broad.gsa.gatk.run.reports.test"; + protected final static String AWS_ACCESS_KEY_MD5 = "34d4a26eb2062b3f06e833b28f9a38c6"; + protected final static String AWS_SECRET_KEY_MD5 = "83f2332eec99ef1d7425d5dc5d4b514a"; + + private static final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyy/MM/dd HH.mm.ss"); + + /** + * our log + */ + protected static final Logger logger = Logger.getLogger(GATKRunReport.class); + + /** + * Default value for the number of milliseconds before an S3 put operation is timed-out. + * Can be overridden via a constructor argument. + */ + private static final long S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS = 30 * 1000; + + /** + * Number of milliseconds before an S3 put operation is timed-out. + */ + private long s3PutTimeOutInMilliseconds = S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS; + + // ----------------------------------------------------------------- + // elements captured for the report + // ----------------------------------------------------------------- + + @Element(required = false, name = "id") + private String id; + + @Element(required = false, name = "exception") + private GATKRunReportException mException; + + @Element(required = true, name = "start-time") + private String startTime = "ND"; + + @Element(required = true, name = "end-time") + private String endTime; + + @Element(required = true, name = "run-time") + private long runTime = 0; + + @Element(required = true, name = "walker-name") + private String walkerName; + + @Element(required = true, name = "svn-version") + private String svnVersion; + + @Element(required = true, name = "total-memory") + private long totalMemory; + + @Element(required = true, name = "max-memory") + private long maxMemory; + + @Element(required = true, name = "user-name") + private String userName; + + @Element(required = true, name = "host-name") + private String hostName; + + @Element(required = true, name = "java") + private String javaVersion; + + @Element(required = true, name = "machine") + private String machine; + + @Element(required = true, name = "iterations") + private long nIterations; + + @Element(required = true, name = "tag") + private String tag; + + @Element(required = true, name = "num-threads") + private int numThreads; + @Element(required = true, name = "percent-time-running") + private String percentTimeRunning; + @Element(required = true, name = "percent-time-waiting") + private String percentTimeWaiting; + @Element(required = true, name = "percent-time-blocking") + private String percentTimeBlocking; + @Element(required = true, name = "percent-time-waiting-for-io") + private String percentTimeWaitingForIO; + + /** The error message, if one occurred, or null if none did */ + public String errorMessage = null; + /** The error that occurred, if one did, or null if none did */ + public Throwable errorThrown = null; + + /** + * How should the GATK report its usage? + */ + public enum PhoneHomeOption { + /** Disable phone home */ + NO_ET, + /** Forces the report to go to S3 */ + AWS, + /** Force output to STDOUT. For debugging only */ + STDOUT + } + + /** + * To allow us to deserial reports from XML + */ + private GATKRunReport() { } + + /** + * Read a GATKRunReport from the serialized XML representation in String reportAsXML + * @param stream an input stream containing a serialized XML report + * @return a reconstituted GATKRunReport from reportAsXML + * @throws Exception if parsing fails for any reason + */ + @Ensures("result != null") + protected static GATKRunReport deserializeReport(final InputStream stream) throws Exception { + final Serializer serializer = new Persister(); + return serializer.read(GATKRunReport.class, stream); + } + + /** + * Create a new GATKRunReport from a report on S3 + * + * Assumes that s3Object has already been written to S3, and this function merely + * fetches it from S3 and deserializes it. The access keys must have permission to + * GetObject from S3. + * + * @param downloaderAccessKey AWS access key with permission to GetObject from bucketName + * @param downloaderSecretKey AWS secret key with permission to GetObject from bucketName + * @param bucketName the name of the bucket holding the report + * @param s3Object the s3Object we wrote to S3 in bucketName that we want to get back and decode + * @return a deserialized report derived from s3://bucketName/s3Object.getName() + * @throws Exception + */ + @Ensures("result != null") + protected static GATKRunReport deserializeReport(final String downloaderAccessKey, + final String downloaderSecretKey, + final String bucketName, + final S3Object s3Object) throws Exception { + final S3Service s3Service = initializeAWSService(downloaderAccessKey, downloaderSecretKey); + + // Retrieve the whole data object we created previously + final S3Object objectComplete = s3Service.getObject(bucketName, s3Object.getName()); + + // Read the data from the object's DataInputStream using a loop, and print it out. + return deserializeReport(new GZIPInputStream(objectComplete.getDataInputStream())); + } + + /** + * Create a new RunReport and population all of the fields with values from the walker and engine. + * Allows the S3 put timeout to be explicitly set. + * + * @param walker the GATK walker that we ran + * @param e the exception caused by running this walker, or null if we completed successfully + * @param engine the GAE we used to run the walker, so we can fetch runtime, args, etc + * @param type the GATK phone home setting + * @param s3PutTimeOutInMilliseconds number of milliseconds to wait before timing out an S3 put operation + */ + public GATKRunReport(final Walker walker, final Exception e, final GenomeAnalysisEngine engine, final PhoneHomeOption type, + final long s3PutTimeOutInMilliseconds) { + this(walker, e, engine, type); + this.s3PutTimeOutInMilliseconds = s3PutTimeOutInMilliseconds; + } + + /** + * Create a new RunReport and population all of the fields with values from the walker and engine. + * Leaves the S3 put timeout set to the default value of S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS. + * + * @param walker the GATK walker that we ran + * @param e the exception caused by running this walker, or null if we completed successfully + * @param engine the GAE we used to run the walker, so we can fetch runtime, args, etc + * @param type the GATK phone home setting + */ + public GATKRunReport(final Walker walker, final Exception e, final GenomeAnalysisEngine engine, final PhoneHomeOption type) { + if ( type == PhoneHomeOption.NO_ET ) + throw new ReviewedGATKException("Trying to create a run report when type is NO_ET!"); + + logger.debug("Aggregating data for run report"); + + // what did we run? + id = org.apache.commons.lang.RandomStringUtils.randomAlphanumeric(32); + walkerName = engine.getWalkerName(walker.getClass()); + svnVersion = CommandLineGATK.getVersionNumber(); + + // runtime performance metrics + Date end = new java.util.Date(); + endTime = DATE_FORMAT.format(end); + if ( engine.getStartTime() != null ) { // made it this far during initialization + startTime = DATE_FORMAT.format(engine.getStartTime()); + runTime = (end.getTime() - engine.getStartTime().getTime()) / 1000L; // difference in seconds + } + + // deal with memory usage + Runtime.getRuntime().gc(); // call GC so totalMemory is ~ used memory + maxMemory = Runtime.getRuntime().maxMemory(); + totalMemory = Runtime.getRuntime().totalMemory(); + + // we can only do some operations if an error hasn't occurred + if ( engine.getCumulativeMetrics() != null ) { + // it's possible we aborted so early that these data structures arent initialized + nIterations = engine.getCumulativeMetrics().getNumIterations(); + } + + tag = engine.getArguments().tag; + + // user and hostname -- information about the runner of the GATK + userName = System.getProperty("user.name"); + hostName = Utils.resolveHostname(); + + // basic java information + javaVersion = Utils.join("-", Arrays.asList(System.getProperty("java.vendor"), System.getProperty("java.version"))); + machine = Utils.join("-", Arrays.asList(System.getProperty("os.name"), System.getProperty("os.arch"))); + + // if there was an exception, capture it + this.mException = e == null ? null : new GATKRunReportException(e); + + numThreads = engine.getTotalNumberOfThreads(); + percentTimeRunning = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.USER_CPU); + percentTimeBlocking = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.BLOCKING); + percentTimeWaiting = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING); + percentTimeWaitingForIO = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING_FOR_IO); + } + + /** + * Get the random alpha-numeric ID of this GATKRunReport + * @return a non-null string ID + */ + @Ensures("result != null") + public String getID() { + return id; + } + + /** + * Return a string representing the percent of time the GATK spent in state, if possible. Otherwise return NA + * + * @param engine the GATK engine whose threading efficiency info we will use + * @param state the state whose occupancy we wish to know + * @return a string representation of the percent occupancy of state, or NA is not possible + */ + @Requires({"engine != null", "state != null"}) + @Ensures("result != null") + private String getThreadEfficiencyPercent(final GenomeAnalysisEngine engine, final ThreadEfficiencyMonitor.State state) { + final ThreadEfficiencyMonitor tem = engine.getThreadEfficiencyMonitor(); + return tem == null ? "NA" : String.format("%.2f", tem.getStatePercent(state)); + } + + /** + * Get a filename (no path) appropriate for this report + * + * @return a non-null string filename + */ + @Ensures("result != null") + protected String getReportFileName() { + return getID() + ".report.xml.gz"; + } + + // --------------------------------------------------------------------------- + // + // Main public interface method for posting reports + // + // --------------------------------------------------------------------------- + + /** + * Post this GATK report to the destination implied by the PhoneHomeOption type + * + * Guaranteed to never throw an exception (exception noted below) and to return + * with a reasonable (~10 seconds) time regardless of successful writing of the report. + * + * @throws IllegalArgumentException if type == null + * @param type the type of phoning home we want to do + * @return true if a report was successfully written, false otherwise + */ + public boolean postReport(final PhoneHomeOption type) { + if ( type == null ) throw new IllegalArgumentException("type cannot be null"); + + logger.debug("Posting report of type " + type); + switch (type) { + case NO_ET: // don't do anything + return false; + case AWS: + wentToAWS = true; + return postReportToAWSS3() != null; + case STDOUT: + return postReportToStream(System.out); + default: + exceptDuringRunReport("BUG: unexpected PhoneHomeOption "); + return false; + } + } + + // --------------------------------------------------------------------------- + // + // Code for sending reports to local files + // + // --------------------------------------------------------------------------- + + /** + * Write an XML representation of this report to the stream, throwing a GATKException if the marshalling + * fails for any reason. + * + * @param stream an output stream to write the report to + */ + @Requires("stream != null") + protected boolean postReportToStream(final OutputStream stream) { + final Serializer serializer = new Persister(); + try { + serializer.write(this, stream); + return true; + } catch (Exception e) { + return false; + } + } + + // --------------------------------------------------------------------------- + // + // Code for sending reports to s3 + // + // --------------------------------------------------------------------------- + + /** + * Get the name of the S3 bucket where we should upload this report + * + * @return the string name of the s3 bucket + */ + @Ensures("result != null") + protected String getS3ReportBucket() { + return s3ReportBucket; + } + + /** + * Decrypts encrypted AWS key from encryptedKeySource + * @param encryptedKeySource a file containing an encrypted AWS key + * @return a decrypted AWS key as a String + */ + @Ensures("result != null") + public static String decryptAWSKey(final File encryptedKeySource) throws FileNotFoundException { + if ( encryptedKeySource == null ) throw new IllegalArgumentException("encryptedKeySource cannot be null"); + return decryptAWSKey(new FileInputStream(encryptedKeySource)); + } + + /** + * @see #decryptAWSKey(java.io.File) but with input from an inputstream + */ + @Requires("encryptedKeySource != null") + @Ensures("result != null") + private static String decryptAWSKey(final InputStream encryptedKeySource) { + final PublicKey key = CryptUtils.loadGATKDistributedPublicKey(); + final byte[] fromDisk = IOUtils.readStreamIntoByteArray(encryptedKeySource); + final byte[] decrypted = CryptUtils.decryptData(fromDisk, key); + return new String(decrypted); + } + + /** + * Get the decrypted AWS key sorted in the resource directories of name + * @param name the name of the file containing the needed AWS key + * @return a non-null GATK + */ + @Requires("name != null") + @Ensures("result != null") + private static String getAWSKey(final String name) { + final Resource resource = new Resource(name, GATKRunReport.class); + return decryptAWSKey(resource.getResourceContentsAsStream()); + } + + /** + * Get the AWS access key for the GATK user + * @return a non-null AWS access key for the GATK user + */ + @Ensures("result != null") + protected static String getAWSUploadAccessKey() { + return getAWSKey("resources/GATK_AWS_access.key"); + } + + /** + * Get the AWS secret key for the GATK user + * @return a non-null AWS secret key for the GATK user + */ + @Ensures("result != null") + protected static String getAWSUploadSecretKey() { + return getAWSKey("resources/GATK_AWS_secret.key"); + } + + /** + * Check that the AWS keys can be decrypted and are what we expect them to be + * + * @throws ReviewedGATKException if anything goes wrong + */ + public static void checkAWSAreValid() { + try { + final String accessKeyMD5 = Utils.calcMD5(getAWSUploadAccessKey()); + final String secretKeyMD5 = Utils.calcMD5(getAWSUploadSecretKey()); + + if ( ! AWS_ACCESS_KEY_MD5.equals(accessKeyMD5) ) { + throw new ReviewedGATKException("Invalid AWS access key found, expected MD5 " + AWS_ACCESS_KEY_MD5 + " but got " + accessKeyMD5); + } + if ( ! AWS_SECRET_KEY_MD5.equals(secretKeyMD5) ) { + throw new ReviewedGATKException("Invalid AWS secret key found, expected MD5 " + AWS_SECRET_KEY_MD5 + " but got " + secretKeyMD5); + } + + } catch ( Exception e ) { + throw new ReviewedGATKException("Couldn't decrypt AWS keys, something is wrong with the GATK distribution"); + } + } + + /** + * Get an initialized S3Service for use in communicating with AWS/s3 + * + * @param awsAccessKey our AWS access key to use + * @param awsSecretKey our AWS secret key to use + * @return an initialized S3Service object that can be immediately used to interact with S3 + * @throws S3ServiceException + */ + @Requires({"awsAccessKey != null", "awsSecretKey != null"}) + @Ensures("result != null") + protected static S3Service initializeAWSService(final String awsAccessKey, final String awsSecretKey) throws S3ServiceException { + // To communicate with S3, create a class that implements an S3Service. We will use the REST/HTTP + // implementation based on HttpClient, as this is the most robust implementation provided with JetS3t. + final AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey); + return new RestS3Service(awsCredentials); + } + + /** + * A runnable that pushes this GATKReport up to s3. + * + * Should be run in a separate thread so we can time it out if something is taking too long + */ + private class S3PutRunnable implements Runnable { + /** Was the upload operation successful? */ + public final AtomicBoolean isSuccess; + /** The name of this report */ + private final String filename; + /** The contents of this report */ + private final byte[] contents; + + /** The s3Object that we created to upload, or null if it failed */ + public S3Object s3Object = null; + + @Requires({"filename != null", "contents != null"}) + public S3PutRunnable(final String filename, final byte[] contents){ + this.isSuccess = new AtomicBoolean(); + this.filename = filename; + this.contents = contents; + } + + public void run() { + try { + switch ( awsMode ) { + case FAIL_WITH_EXCEPTION: + throw new IllegalStateException("We are throwing an exception for testing purposes"); + case TIMEOUT: + try { + Thread.sleep(s3PutTimeOutInMilliseconds * 100); + } catch ( InterruptedException e ) { + // supposed to be empty + } + break; + case NORMAL: + // IAM GATK user credentials -- only right is to PutObject into broad.gsa.gatk.run.reports bucket + final S3Service s3Service = initializeAWSService(getAWSUploadAccessKey(), getAWSUploadSecretKey()); + + // Create an S3Object based on a file, with Content-Length set automatically and + // Content-Type set based on the file's extension (using the Mimetypes utility class) + final S3Object fileObject = new S3Object(filename, contents); + //logger.info("Created S3Object" + fileObject); + //logger.info("Uploading " + localFile + " to AWS bucket"); + s3Object = s3Service.putObject(getS3ReportBucket(), fileObject); + isSuccess.set(true); + break; + default: + throw new IllegalStateException("Unexpected AWS exception"); + } + } catch ( S3ServiceException e ) { + exceptDuringRunReport("S3 exception occurred", e); + } catch ( NoSuchAlgorithmException e ) { + exceptDuringRunReport("Couldn't calculate MD5", e); + } catch ( IOException e ) { + exceptDuringRunReport("Couldn't read report file", e); + } catch ( Exception e ) { + exceptDuringRunReport("An unexpected exception occurred during posting", e); + } + } + } + + /** + * Post this GATK report to the AWS s3 GATK_Run_Report log + * + * @return the s3Object pointing to our pushed report, or null if we failed to push + */ + protected S3Object postReportToAWSS3() { + // modifying example code from http://jets3t.s3.amazonaws.com/toolkit/code-samples.html + this.hostName = Utils.resolveHostname(); // we want to fill in the host name + final String key = getReportFileName(); + logger.debug("Generating GATK report to AWS S3 with key " + key); + + try { + // create an byte output stream so we can capture the output as a byte[] + final ByteArrayOutputStream byteStream = new ByteArrayOutputStream(8096); + final OutputStream outputStream = new GZIPOutputStream(byteStream); + postReportToStream(outputStream); + outputStream.close(); + final byte[] report = byteStream.toByteArray(); + + // stop us from printing the annoying, and meaningless, mime types warning + final Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class); + mimeTypeLogger.setLevel(Level.FATAL); + + // Set the S3 upload on its own thread with timeout: + final S3PutRunnable s3run = new S3PutRunnable(key,report); + final Thread s3thread = new Thread(s3run); + s3thread.setDaemon(true); + s3thread.setName("S3Put-Thread"); + s3thread.start(); + + s3thread.join(s3PutTimeOutInMilliseconds); + + if(s3thread.isAlive()){ + s3thread.interrupt(); + exceptDuringRunReport("Run statistics report upload to AWS S3 timed-out"); + } else if(s3run.isSuccess.get()) { + logger.info("Uploaded run statistics report to AWS S3"); + logger.debug("Uploaded to AWS: " + s3run.s3Object); + return s3run.s3Object; + } else { + // an exception occurred, the thread should have already invoked the exceptDuringRunReport function + } + } catch ( IOException e ) { + exceptDuringRunReport("Couldn't read report file", e); + } catch ( InterruptedException e) { + exceptDuringRunReport("Run statistics report upload interrupted", e); + } + + return null; + } + + // --------------------------------------------------------------------------- + // + // Error handling code + // + // --------------------------------------------------------------------------- + + /** + * Note that an exception occurred during creating or writing this report + * @param msg the message to print + * @param e the exception that occurred + */ + @Ensures("exceptionOccurredDuringPost()") + private void exceptDuringRunReport(final String msg, final Throwable e) { + this.errorMessage = msg; + this.errorThrown = e; + logger.debug("A problem occurred during GATK run reporting [*** everything is fine, but no report could be generated; please do not post this to the support forum ***]. Message is: " + msg + ". Error message is: " + e.getMessage()); + } + + /** + * Note that an exception occurred during creating or writing this report + * @param msg the message to print + */ + @Ensures("exceptionOccurredDuringPost()") + private void exceptDuringRunReport(final String msg) { + this.errorMessage = msg; + logger.debug("A problem occurred during GATK run reporting [*** everything is fine, but no report could be generated; please do not post this to the support forum ***]. Message is " + msg); + } + + /** + * Did an error occur during the posting of this run report? + * @return true if so, false if not + */ + public boolean exceptionOccurredDuringPost() { + return getErrorMessage() != null; + } + + /** + * If an error occurred during posting of this report, retrieve the message of the error that occurred, or null if + * no error occurred + * @return a string describing the error that occurred, or null if none did + */ + public String getErrorMessage() { + return errorMessage; + } + + /** + * Get the throwable that caused the exception during posting of this message, or null if none was available + * + * Note that getting a null valuable from this function doesn't not imply that no error occurred. Some + * errors that occurred many not have generated a throwable. + * + * @return the Throwable that caused the error, or null if no error occurred or was not caused by a throwable + */ + public Throwable getErrorThrown() { + return errorThrown; + } + + /** + * Helper method to format the exception that occurred during posting, or a string saying none occurred + * @return a non-null string + */ + @Ensures("result != null") + protected String formatError() { + return exceptionOccurredDuringPost() + ? String.format("Exception message=%s with cause=%s", getErrorMessage(), getErrorThrown()) + : "No exception occurred"; + } + + // --------------------------------------------------------------------------- + // + // Equals and hashcode -- purely for comparing reports for testing + // + // --------------------------------------------------------------------------- + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + GATKRunReport that = (GATKRunReport) o; + + if (maxMemory != that.maxMemory) return false; + if (nIterations != that.nIterations) return false; + if (numThreads != that.numThreads) return false; + if (runTime != that.runTime) return false; + if (totalMemory != that.totalMemory) return false; + if (endTime != null ? !endTime.equals(that.endTime) : that.endTime != null) return false; + if (hostName != null ? !hostName.equals(that.hostName) : that.hostName != null) return false; + if (id != null ? !id.equals(that.id) : that.id != null) return false; + if (javaVersion != null ? !javaVersion.equals(that.javaVersion) : that.javaVersion != null) return false; + if (mException != null ? !mException.equals(that.mException) : that.mException != null) return false; + if (machine != null ? !machine.equals(that.machine) : that.machine != null) return false; + if (percentTimeBlocking != null ? !percentTimeBlocking.equals(that.percentTimeBlocking) : that.percentTimeBlocking != null) + return false; + if (percentTimeRunning != null ? !percentTimeRunning.equals(that.percentTimeRunning) : that.percentTimeRunning != null) + return false; + if (percentTimeWaiting != null ? !percentTimeWaiting.equals(that.percentTimeWaiting) : that.percentTimeWaiting != null) + return false; + if (percentTimeWaitingForIO != null ? !percentTimeWaitingForIO.equals(that.percentTimeWaitingForIO) : that.percentTimeWaitingForIO != null) + return false; + if (startTime != null ? !startTime.equals(that.startTime) : that.startTime != null) return false; + if (svnVersion != null ? !svnVersion.equals(that.svnVersion) : that.svnVersion != null) return false; + if (tag != null ? !tag.equals(that.tag) : that.tag != null) return false; + if (userName != null ? !userName.equals(that.userName) : that.userName != null) return false; + if (walkerName != null ? !walkerName.equals(that.walkerName) : that.walkerName != null) return false; + + return true; + } + + @Override + public int hashCode() { + int result = id != null ? id.hashCode() : 0; + result = 31 * result + (mException != null ? mException.hashCode() : 0); + result = 31 * result + (startTime != null ? startTime.hashCode() : 0); + result = 31 * result + (endTime != null ? endTime.hashCode() : 0); + result = 31 * result + (int) (runTime ^ (runTime >>> 32)); + result = 31 * result + (walkerName != null ? walkerName.hashCode() : 0); + result = 31 * result + (svnVersion != null ? svnVersion.hashCode() : 0); + result = 31 * result + (int) (totalMemory ^ (totalMemory >>> 32)); + result = 31 * result + (int) (maxMemory ^ (maxMemory >>> 32)); + result = 31 * result + (userName != null ? userName.hashCode() : 0); + result = 31 * result + (hostName != null ? hostName.hashCode() : 0); + result = 31 * result + (javaVersion != null ? javaVersion.hashCode() : 0); + result = 31 * result + (machine != null ? machine.hashCode() : 0); + result = 31 * result + (int) (nIterations ^ (nIterations >>> 32)); + result = 31 * result + (tag != null ? tag.hashCode() : 0); + result = 31 * result + numThreads; + result = 31 * result + (percentTimeRunning != null ? percentTimeRunning.hashCode() : 0); + result = 31 * result + (percentTimeWaiting != null ? percentTimeWaiting.hashCode() : 0); + result = 31 * result + (percentTimeBlocking != null ? percentTimeBlocking.hashCode() : 0); + result = 31 * result + (percentTimeWaitingForIO != null ? percentTimeWaitingForIO.hashCode() : 0); + return result; + } + + // --------------------------------------------------------------------------- + // + // Code specifically for testing the GATKRunReport + // + // --------------------------------------------------------------------------- + + /** + * Enum specifying how the S3 uploader should behave. Must be normal by default. Purely for testing purposes + */ + protected enum AWSMode { + NORMAL, // write normally to AWS + FAIL_WITH_EXCEPTION, // artificially fail during writing + TIMEOUT // sleep, so we time out + } + /** Our AWS mode */ + private AWSMode awsMode = AWSMode.NORMAL; + /** The bucket were we send the GATK report on AWS/s3 */ + private String s3ReportBucket = REPORT_BUCKET_NAME; + /** Did we send the report to AWS? */ + private boolean wentToAWS = false; + + /** + * Send the report to the AWS test bucket -- for testing only + */ + protected void sendAWSToTestBucket() { + s3ReportBucket = TEST_REPORT_BUCKET_NAME; + } + + /** + * Has the report been written to AWS? + * + * Does not imply anything about the success of the send, just that it was attempted + * + * @return true if the report has been sent to AWS, false otherwise + */ + protected boolean wentToAWS() { + return wentToAWS; + } + + /** + * Purely for testing purposes. Tells the AWS uploader whether to actually upload or simulate errors + * @param mode what we want to do + */ + @Requires("mode != null") + protected void setAwsMode(final AWSMode mode) { + this.awsMode = mode; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReportException.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReportException.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReportException.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReportException.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRArgumentSet.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRArgumentSet.java new file mode 100644 index 000000000..497eafe68 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRArgumentSet.java @@ -0,0 +1,85 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; + +import java.io.File; + +public class BQSRArgumentSet { + // declare public, STL-style for easier and more efficient access: + private File BQSR_RECAL_FILE; + private int quantizationLevels; + private boolean disableIndelQuals; + private boolean emitOriginalQuals; + private int PRESERVE_QSCORES_LESS_THAN; + private double globalQScorePrior; + + public BQSRArgumentSet(final GATKArgumentCollection args) { + this.BQSR_RECAL_FILE = args.BQSR_RECAL_FILE; + this.quantizationLevels = args.quantizationLevels; + this.disableIndelQuals = args.disableIndelQuals; + this.emitOriginalQuals = args.emitOriginalQuals; + this.PRESERVE_QSCORES_LESS_THAN = args.PRESERVE_QSCORES_LESS_THAN; + this.globalQScorePrior = args.globalQScorePrior; + } + + public File getRecalFile() { return BQSR_RECAL_FILE; } + + public int getQuantizationLevels() { return quantizationLevels; } + + public boolean shouldDisableIndelQuals() { return disableIndelQuals; } + + public boolean shouldEmitOriginalQuals() { return emitOriginalQuals; } + + public int getPreserveQscoresLessThan() { return PRESERVE_QSCORES_LESS_THAN; } + + public double getGlobalQScorePrior() { return globalQScorePrior; } + + public void setRecalFile(final File BQSR_RECAL_FILE) { + this.BQSR_RECAL_FILE = BQSR_RECAL_FILE; + } + + public void setQuantizationLevels(final int quantizationLevels) { + this.quantizationLevels = quantizationLevels; + } + + public void setDisableIndelQuals(final boolean disableIndelQuals) { + this.disableIndelQuals = disableIndelQuals; + } + + public void setEmitOriginalQuals(final boolean emitOriginalQuals) { + this.emitOriginalQuals = emitOriginalQuals; + } + + public void setPreserveQscoresLessThan(final int PRESERVE_QSCORES_LESS_THAN) { + this.PRESERVE_QSCORES_LESS_THAN = PRESERVE_QSCORES_LESS_THAN; + } + + public void setGlobalQScorePrior(final double globalQScorePrior) { + this.globalQScorePrior = globalQScorePrior; + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRMode.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRMode.java new file mode 100644 index 000000000..de6500e19 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRMode.java @@ -0,0 +1,55 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; + +import java.lang.annotation.*; + +/** + * User: hanna + * Date: May 14, 2009 + * Time: 1:51:22 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Allows the walker to indicate what type of data it wants to consume. + */ + +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface BQSRMode { + public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT; +} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/resourcemanagement/ThreadAllocation.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/resourcemanagement/ThreadAllocation.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/resourcemanagement/ThreadAllocation.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/resourcemanagement/ThreadAllocation.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/Affection.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/Affection.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/Affection.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/Affection.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/Gender.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/Gender.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/Gender.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/Gender.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/MendelianViolation.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/MendelianViolation.java new file mode 100644 index 000000000..a37eb8d88 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/MendelianViolation.java @@ -0,0 +1,461 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.samples; + +import org.broadinstitute.gatk.engine.samples.Sample; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeType; +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.MathUtils; + +import java.util.*; + +/** + * User: carneiro / lfran + * Date: 3/9/11 + * Time: 12:38 PM + * + * Class for the identification and tracking of mendelian violation. It can be used in 2 distinct ways: + * - Either using an instance of the MendelianViolation class to track mendelian violations for each of the families while + * walking over the variants + * - Or using the static methods to directly get information about mendelian violation in a family at a given locus + * + */ +public class MendelianViolation { + //List of families with violations + private List violationFamilies; + + //Call information + private int nocall = 0; + private int familyCalled = 0; + private int varFamilyCalled = 0; + private int lowQual = 0; + + private boolean allCalledOnly = true; + + //Stores occurrences of inheritance + private EnumMap>> inheritance; + + private int violations_total=0; + + private double minGenotypeQuality; + + private boolean abortOnSampleNotFound; + + //Number of families with genotype information for all members + public int getFamilyCalledCount(){ + return familyCalled; + } + + //Number of families with genotype information for all members + public int getVarFamilyCalledCount(){ + return varFamilyCalled; + } + + //Number of families missing genotypes for one or more of their members + public int getFamilyNoCallCount(){ + return nocall; + } + + //Number of families with genotypes below the set quality threshold + public int getFamilyLowQualsCount(){ + return lowQual; + } + + public int getViolationsCount(){ + return violations_total; + } + + //Count of alt alleles inherited from het parents (no violation) + public int getParentHetInheritedVar(){ + return getParentsHetHetInheritedVar() + getParentsRefHetInheritedVar() + getParentsVarHetInheritedVar(); + } + + //Count of ref alleles inherited from het parents (no violation) + public int getParentHetInheritedRef(){ + return getParentsHetHetInheritedRef() + getParentsRefHetInheritedRef() + getParentsVarHetInheritedRef(); + } + + //Count of HomRef/HomRef/HomRef trios + public int getRefRefRef(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF); + } + + //Count of HomVar/HomVar/HomVar trios + public int getVarVarVar(){ + return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR); + } + + //Count of HomRef/HomVar/Het trios + public int getRefVarHet(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HET) + + inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HET); + } + + //Count of Het/Het/Het trios + public int getHetHetHet(){ + return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET); + } + + //Count of Het/Het/HomRef trios + public int getHetHetHomRef(){ + return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_REF); + } + + //Count of Het/Het/HomVar trios + public int getHetHetHomVar(){ + return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_VAR); + } + + //Count of ref alleles inherited from Het/Het parents (no violation) + public int getParentsHetHetInheritedRef(){ + return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET) + + 2*inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_REF); + //return parentsHetHet_childRef; + } + + //Count of var alleles inherited from Het/Het parents (no violation) + public int getParentsHetHetInheritedVar(){ + return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET) + + 2*inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_VAR); + //return parentsHetHet_childVar; + } + + //Count of ref alleles inherited from HomRef/Het parents (no violation) + public int getParentsRefHetInheritedRef(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HOM_REF) + + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF); + //return parentsHomRefHet_childRef; + } + + //Count of var alleles inherited from HomRef/Het parents (no violation) + public int getParentsRefHetInheritedVar(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HET) + + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HET); + //return parentsHomRefHet_childVar; + } + + //Count of ref alleles inherited from HomVar/Het parents (no violation) + public int getParentsVarHetInheritedRef(){ + return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HET) + + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HET); + //return parentsHomVarHet_childRef; + } + + //Count of var alleles inherited from HomVar/Het parents (no violation) + public int getParentsVarHetInheritedVar(){ + return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HOM_VAR) + + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR); + //return parentsHomVarHet_childVar; + } + + //Count of violations of the type HOM_REF/HOM_REF -> HOM_VAR + public int getParentsRefRefChildVar(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR); + } + + //Count of violations of the type HOM_REF/HOM_REF -> HET + public int getParentsRefRefChildHet(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HET); + } + + //Count of violations of the type HOM_REF/HET -> HOM_VAR + public int getParentsRefHetChildVar(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HOM_VAR) + + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR); + } + + //Count of violations of the type HOM_REF/HOM_VAR -> HOM_VAR + public int getParentsRefVarChildVar(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR) + + inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR); + } + + //Count of violations of the type HOM_REF/HOM_VAR -> HOM_REF + public int getParentsRefVarChildRef(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF) + + inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF); + } + + //Count of violations of the type HOM_VAR/HET -> HOM_REF + public int getParentsVarHetChildRef(){ + return inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF) + + inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HOM_REF); + } + + //Count of violations of the type HOM_VAR/HOM_VAR -> HOM_REF + public int getParentsVarVarChildRef(){ + return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF); + } + + //Count of violations of the type HOM_VAR/HOM_VAR -> HET + public int getParentsVarVarChildHet(){ + return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HET); + } + + + //Count of violations of the type HOM_VAR/? -> HOM_REF + public int getParentVarChildRef(){ + return getParentsRefVarChildRef() + getParentsVarHetChildRef() +getParentsVarVarChildRef(); + } + + //Count of violations of the type HOM_REF/? -> HOM_VAR + public int getParentRefChildVar(){ + return getParentsRefVarChildVar() + getParentsRefHetChildVar() +getParentsRefRefChildVar(); + } + + //Returns a String containing all trios where a Mendelian violation was observed. + //The String is formatted "mom1+dad1=child1,mom2+dad2=child2,..." + public String getViolationFamiliesString(){ + if(violationFamilies.isEmpty()) + return ""; + + Iterator it = violationFamilies.iterator(); + String violationFams = it.next(); + while(it.hasNext()){ + violationFams += ","+it.next(); + } + return violationFams; + } + + public List getViolationFamilies(){ + return violationFamilies; + } + + static final int[] mvOffsets = new int[] { 1,2,5,6,8,11,15,18,20,21,24,25 }; + static final int[] nonMVOffsets = new int[]{ 0,3,4,7,9,10,12,13,14,16,17,19,22,23,26 }; + + public double getMinGenotypeQuality() { + return minGenotypeQuality; + } + + /** + * Constructor + * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation + * + */ + public MendelianViolation(double minGenotypeQualityP) { + this(minGenotypeQualityP,true); + } + + /** + * Constructor + * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation + * @param abortOnSampleNotFound - Whether to stop execution if a family is passed but no relevant genotypes are found. If false, then the family is ignored. + */ + public MendelianViolation(double minGenotypeQualityP, boolean abortOnSampleNotFound) { + minGenotypeQuality = minGenotypeQualityP; + this.abortOnSampleNotFound = abortOnSampleNotFound; + violationFamilies = new ArrayList(); + createInheritanceMap(); + } + + /** + * Constructor + * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation + * @param abortOnSampleNotFound - Whether to stop execution if a family is passed but no relevant genotypes are found. If false, then the family is ignored. + * @param completeTriosOnly - whether only complete trios are considered or parent/child pairs are too. + */ + public MendelianViolation(double minGenotypeQualityP, boolean abortOnSampleNotFound, boolean completeTriosOnly) { + minGenotypeQuality = minGenotypeQualityP; + this.abortOnSampleNotFound = abortOnSampleNotFound; + violationFamilies = new ArrayList(); + createInheritanceMap(); + allCalledOnly = completeTriosOnly; + } + + /** + * @param families the families to be checked for Mendelian violations + * @param vc the variant context to extract the genotypes and alleles for mom, dad and child. + * @return whether or not there is a mendelian violation at the site. + */ + public int countViolations(Map> families, VariantContext vc){ + + //Reset counts + nocall = 0; + lowQual = 0; + familyCalled = 0; + varFamilyCalled = 0; + violations_total=0; + violationFamilies.clear(); + clearInheritanceMap(); + + for(Set family : families.values()){ + Iterator sampleIterator = family.iterator(); + Sample sample; + while(sampleIterator.hasNext()){ + sample = sampleIterator.next(); + if(sample.getParents().size() > 0) + updateViolations(sample.getFamilyID(),sample.getMaternalID(), sample.getPaternalID(), sample.getID() ,vc); + } + } + return violations_total; + } + + public boolean isViolation(Sample mother, Sample father, Sample child, VariantContext vc){ + + //Reset counts + nocall = 0; + lowQual = 0; + familyCalled = 0; + varFamilyCalled = 0; + violations_total=0; + violationFamilies.clear(); + clearInheritanceMap(); + updateViolations(mother.getFamilyID(),mother.getID(),father.getID(),child.getID(),vc); + return violations_total>0; + } + + + private void updateViolations(String familyId, String motherId, String fatherId, String childId, VariantContext vc){ + + int count; + Genotype gMom = vc.getGenotype(motherId); + Genotype gDad = vc.getGenotype(fatherId); + Genotype gChild = vc.getGenotype(childId); + + if (gMom == null || gDad == null || gChild == null){ + if(abortOnSampleNotFound) + throw new IllegalArgumentException(String.format("Variant %s:%d: Missing genotypes for family %s: mom=%s dad=%s family=%s", vc.getChr(), vc.getStart(), familyId, motherId, fatherId, childId)); + else + return; + } + //Count No calls + if(allCalledOnly && (!gMom.isCalled() || !gDad.isCalled() || !gChild.isCalled())){ + nocall++; + } + else if (!gMom.isCalled() && !gDad.isCalled() || !gChild.isCalled()){ + nocall++; + } + //Count lowQual. Note that if min quality is set to 0, even values with no quality associated are returned + else if (minGenotypeQuality>0 && (gMom.getPhredScaledQual() < minGenotypeQuality || + gDad.getPhredScaledQual() < minGenotypeQuality || + gChild.getPhredScaledQual() < minGenotypeQuality )) { + lowQual++; + } + else{ + //Count all families per loci called + familyCalled++; + //If the family is all homref, not too interesting + if(!(gMom.isHomRef() && gDad.isHomRef() && gChild.isHomRef())) + { + varFamilyCalled++; + if(isViolation(gMom, gDad, gChild)){ + violationFamilies.add(familyId); + violations_total++; + } + } + count = inheritance.get(gMom.getType()).get(gDad.getType()).get(gChild.getType()); + inheritance.get(gMom.getType()).get(gDad.getType()).put(gChild.getType(),count+1); + + } + } + + /** + * Evaluate the genotypes of mom, dad, and child to detect Mendelian violations + * + * @param gMom + * @param gDad + * @param gChild + * @return true if the three genotypes represent a Mendelian violation; false otherwise + */ + public static boolean isViolation(final Genotype gMom, final Genotype gDad, final Genotype gChild) { + //1 parent is no "call + if(!gMom.isCalled()){ + return (gDad.isHomRef() && gChild.isHomVar()) || (gDad.isHomVar() && gChild.isHomRef()); + } + else if(!gDad.isCalled()){ + return (gMom.isHomRef() && gChild.isHomVar()) || (gMom.isHomVar() && gChild.isHomRef()); + } + //Both parents have genotype information + return !(gMom.getAlleles().contains(gChild.getAlleles().get(0)) && gDad.getAlleles().contains(gChild.getAlleles().get(1)) || + gMom.getAlleles().contains(gChild.getAlleles().get(1)) && gDad.getAlleles().contains(gChild.getAlleles().get(0))); + } + + private void createInheritanceMap(){ + + inheritance = new EnumMap>>(GenotypeType.class); + for(GenotypeType mType : GenotypeType.values()){ + inheritance.put(mType, new EnumMap>(GenotypeType.class)); + for(GenotypeType dType : GenotypeType.values()){ + inheritance.get(mType).put(dType, new EnumMap(GenotypeType.class)); + for(GenotypeType cType : GenotypeType.values()){ + inheritance.get(mType).get(dType).put(cType, 0); + } + } + } + + } + + private void clearInheritanceMap(){ + for(GenotypeType mType : GenotypeType.values()){ + for(GenotypeType dType : GenotypeType.values()){ + for(GenotypeType cType : GenotypeType.values()){ + inheritance.get(mType).get(dType).put(cType, 0); + } + } + } + } + + /** + * @return the likelihood ratio for a mendelian violation + */ + public double violationLikelihoodRatio(VariantContext vc, String motherId, String fatherId, String childId) { + double[] logLikAssignments = new double[27]; + // the matrix to set up is + // MOM DAD CHILD + // |- AA + // AA AA | AB + // |- BB + // |- AA + // AA AB | AB + // |- BB + // etc. The leaves are counted as 0-11 for MVs and 0-14 for non-MVs + double[] momGL = vc.getGenotype(motherId).getLikelihoods().getAsVector(); + double[] dadGL = vc.getGenotype(fatherId).getLikelihoods().getAsVector(); + double[] childGL = vc.getGenotype(childId).getLikelihoods().getAsVector(); + int offset = 0; + for ( int oMom = 0; oMom < 3; oMom++ ) { + for ( int oDad = 0; oDad < 3; oDad++ ) { + for ( int oChild = 0; oChild < 3; oChild ++ ) { + logLikAssignments[offset++] = momGL[oMom] + dadGL[oDad] + childGL[oChild]; + } + } + } + double[] mvLiks = new double[12]; + double[] nonMVLiks = new double[15]; + for ( int i = 0; i < 12; i ++ ) { + mvLiks[i] = logLikAssignments[mvOffsets[i]]; + } + + for ( int i = 0; i < 15; i++) { + nonMVLiks[i] = logLikAssignments[nonMVOffsets[i]]; + } + + return MathUtils.log10sumLog10(mvLiks) - MathUtils.log10sumLog10(nonMVLiks); + } + +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/PedReader.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/PedReader.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/PedReader.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/PedReader.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/PedigreeValidationType.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/PedigreeValidationType.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/PedigreeValidationType.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/PedigreeValidationType.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/Sample.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/Sample.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/Sample.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/Sample.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDB.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDB.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDB.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDB.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDBBuilder.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDBBuilder.java new file mode 100644 index 000000000..2744bec61 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDBBuilder.java @@ -0,0 +1,161 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.samples; + +import htsjdk.samtools.SAMFileHeader; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.ReadUtils; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +/** + * + */ +public class SampleDBBuilder { + PedigreeValidationType validationStrictness; + final SampleDB sampleDB = new SampleDB(); + final GenomeAnalysisEngine engine; + + Set samplesFromDataSources = new HashSet(); + Set samplesFromPedigrees = new HashSet(); + + /** for testing only */ + protected SampleDBBuilder(PedigreeValidationType validationStrictness) { + engine = null; + this.validationStrictness = validationStrictness; + } + + /** + * Constructor takes both a SAM header and sample files because the two must be integrated. + */ + public SampleDBBuilder(GenomeAnalysisEngine engine, PedigreeValidationType validationStrictness) { + this.engine = engine; + this.validationStrictness = validationStrictness; + } + + /** + * Hallucinates sample objects for all the samples in the SAM file and stores them + */ + public SampleDBBuilder addSamplesFromSAMHeader(final SAMFileHeader header) { + addSamplesFromSampleNames(ReadUtils.getSAMFileSamples(header)); + return this; + } + + public SampleDBBuilder addSamplesFromSampleNames(final Collection sampleNames) { + for (final String sampleName : sampleNames) { + if (sampleDB.getSample(sampleName) == null) { + final Sample newSample = new Sample(sampleName, sampleDB); + sampleDB.addSample(newSample); + samplesFromDataSources.add(newSample); // keep track of data source samples + } + } + return this; + } + + public SampleDBBuilder addSamplesFromPedigreeFiles(final List pedigreeFiles) { + for (final File pedFile : pedigreeFiles) { + Collection samples = addSamplesFromPedigreeArgument(pedFile); + samplesFromPedigrees.addAll(samples); + } + + return this; + } + + public SampleDBBuilder addSamplesFromPedigreeStrings(final List pedigreeStrings) { + for (final String pedString : pedigreeStrings) { + Collection samples = addSamplesFromPedigreeArgument(pedString); + samplesFromPedigrees.addAll(samples); + } + + return this; + } + + /** + * Parse one sample file and integrate it with samples that are already there + * Fail quickly if we find any errors in the file + */ + private Collection addSamplesFromPedigreeArgument(File sampleFile) { + final PedReader reader = new PedReader(); + + try { + return reader.parse(sampleFile, getMissingFields(sampleFile), sampleDB); + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(sampleFile, e); + } + } + + private Collection addSamplesFromPedigreeArgument(final String string) { + final PedReader reader = new PedReader(); + return reader.parse(string, getMissingFields(string), sampleDB); + } + + public SampleDB getFinalSampleDB() { + validate(); + return sampleDB; + } + + public EnumSet getMissingFields(final Object engineArg) { + if ( engine == null ) + return EnumSet.noneOf(PedReader.MissingPedField.class); + else { + final List posTags = engine.getTags(engineArg).getPositionalTags(); + return PedReader.parseMissingFieldTags(engineArg, posTags); + } + } + + // -------------------------------------------------------------------------------- + // + // Validation + // + // -------------------------------------------------------------------------------- + + protected final void validate() { + validatePedigreeIDUniqueness(); + if ( validationStrictness != PedigreeValidationType.SILENT ) { + // check that samples in data sources are all annotated, if anything is annotated + if ( ! samplesFromPedigrees.isEmpty() && ! samplesFromDataSources.isEmpty() ) { + final Set sampleNamesFromPedigrees = new HashSet(); + for ( final Sample pSample : samplesFromPedigrees ) + sampleNamesFromPedigrees.add(pSample.getID()); + + for ( final Sample dsSample : samplesFromDataSources ) + if ( ! sampleNamesFromPedigrees.contains(dsSample.getID()) ) + throw new UserException("Sample " + dsSample.getID() + " found in data sources but not in pedigree files with STRICT pedigree validation"); + } + } + } + + private void validatePedigreeIDUniqueness() { + Set pedigreeIDs = new HashSet(); + for ( Sample sample : samplesFromPedigrees ) { + pedigreeIDs.add(sample.getID()); + } + assert pedigreeIDs.size() == samplesFromPedigrees.size() : "The number of sample IDs extracted from the pedigree does not equal the number of samples in the pedigree. Is a sample associated with multiple families?"; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/Trio.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/Trio.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/Trio.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/Trio.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/ArtificialReadsTraversal.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/ArtificialReadsTraversal.java new file mode 100644 index 000000000..ac34b7594 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/ArtificialReadsTraversal.java @@ -0,0 +1,142 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.traversals; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.engine.datasources.providers.ShardDataProvider; +import org.broadinstitute.gatk.engine.walkers.ReadWalker; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.sam.ArtificialPatternedSAMIterator; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @author aaron + * + * this class acts as a fake reads traversal engine for testing out reads based traversals. + */ +public class ArtificialReadsTraversal extends TraversalEngine,ShardDataProvider> { + + public int startingChr = 1; + public int endingChr = 5; + public int readsPerChr = 100; + public int unMappedReads = 1000; + private int DEFAULT_READ_LENGTH = ArtificialSAMUtils.DEFAULT_READ_LENGTH; + private ArtificialPatternedSAMIterator iter; + /** our log, which we want to capture anything from this class */ + protected static Logger logger = Logger.getLogger(ArtificialReadsTraversal.class); + + /** Creates a new, uninitialized ArtificialReadsTraversal */ + public ArtificialReadsTraversal() { + } + + // what read ordering are we using + private ArtificialPatternedSAMIterator.PATTERN readOrder = ArtificialPatternedSAMIterator.PATTERN.IN_ORDER_READS; + + + /** + * set the read ordering of the reads given to the walker + * + * @param readOrdering + */ + public void setReadOrder( ArtificialPatternedSAMIterator.PATTERN readOrdering ) { + readOrder = readOrdering; + } + + @Override + public String getTraversalUnits() { + return "reads"; + } + + /** + * Traverse by reads, given the data and the walker + * + * @param walker the walker to traverse with + * @param dataProvider the provider of the reads data + * @param sum the value of type T, specified by the walker, to feed to the walkers reduce function + * + * @return the reduce variable of the read walker + */ + public T traverse( Walker walker, + ShardDataProvider dataProvider, + T sum ) { + + if (!( walker instanceof ReadWalker )) + throw new IllegalArgumentException("Walker isn't a read walker!"); + + ReadWalker readWalker = (ReadWalker) walker; + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readsPerChr + DEFAULT_READ_LENGTH); + iter = new ArtificialPatternedSAMIterator(this.startingChr, + this.endingChr, + this.readsPerChr, + this.unMappedReads, + header, + this.readOrder); + + // while we still have more reads + for (SAMRecord read : iter) { + + // an array of characters that represent the reference + ReferenceContext refSeq = null; + + final boolean keepMeP = readWalker.filter(refSeq, (GATKSAMRecord) read); + if (keepMeP) { + M x = readWalker.map(refSeq, (GATKSAMRecord) read, null); // TODO: fix me at some point, it would be nice to fake out ROD data too + sum = readWalker.reduce(x, sum); + } + } + return sum; + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCache.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCache.java new file mode 100644 index 000000000..f84824a59 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCache.java @@ -0,0 +1,168 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.traversals; + +import org.broadinstitute.gatk.utils.downsampling.Downsampler; +import org.broadinstitute.gatk.utils.downsampling.ReservoirDownsampler; +import org.broadinstitute.gatk.utils.sam.AlignmentStartComparator; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * Subsystem to track a list of all reads currently live in the TraverseActiveRegions system, + * while limiting the total number of reads to a maximum capacity. + * + * User: depristo + * Date: 4/7/13 + * Time: 11:23 AM + */ +public class TAROrderedReadCache { + private final int maxCapacity; + private ArrayList undownsampledCache; + private Downsampler downsampler; + + private static final int UNDOWNSAMPLED_CACHE_MAX_INITIAL_SIZE = 10000; + + /** + * Create a new empty ReadCache + * @param maxCapacity the max capacity of the read cache. + */ + public TAROrderedReadCache( final int maxCapacity ) { + if ( maxCapacity < 0 ) throw new IllegalArgumentException("maxCapacity must be >= 0 but got " + maxCapacity); + this.maxCapacity = maxCapacity; + + // The one we're not currently using will always be null: + initializeUndownsampledCache(); + this.downsampler = null; + } + + /** + * Moves all reads over to the downsampler, causing it to be used from this point on. Should be called + * when the undownsampledCache fills up and we need to start discarding reads. Since the + * ReservoirDownsampler doesn't preserve relative ordering, pop operations become expensive + * after this point, as they require a O(n log n) sort. + */ + private void activateDownsampler() { + downsampler = new ReservoirDownsampler<>(maxCapacity, false); + downsampler.submit(undownsampledCache); + undownsampledCache = null; // preferable to the O(n) clear() method + } + + /** + * Allocate the undownsampled cache used when we have fewer than maxCapacity items + */ + private void initializeUndownsampledCache() { + undownsampledCache = new ArrayList<>(Math.min(maxCapacity + 1, UNDOWNSAMPLED_CACHE_MAX_INITIAL_SIZE)); + } + + /** + * What's the maximum number of reads we'll store in the cache? + * @return a positive integer + */ + public int getMaxCapacity() { + return maxCapacity; + } + + /** + * Add a single read to this cache. Assumed to be in sorted order w.r.t. the previously added reads + * @param read a read to add + */ + public void add( final GATKSAMRecord read ) { + if ( read == null ) throw new IllegalArgumentException("Read cannot be null"); + + if ( downsampler != null ) { + downsampler.submit(read); + } + else { + undownsampledCache.add(read); + + // No more room in the undownsampledCache? Time to start downsampling + if ( undownsampledCache.size() > maxCapacity ) { + activateDownsampler(); + } + } + } + + /** + * Add a collection of reads to this cache. Assumed to be in sorted order w.r.t. the previously added reads and each other + * @param reads a collection of reads to add + */ + public void addAll( final List reads ) { + if ( reads == null ) throw new IllegalArgumentException("Reads cannot be null"); + for ( final GATKSAMRecord read : reads ) { + add(read); + } + } + + /** + * How many reads are currently in the cache? + * @return a positive integer + */ + public int size() { + return downsampler != null ? downsampler.size() : undownsampledCache.size(); + } + + /** + * How many reads were discarded since the last call to popCurrentReads + * + * @return number of items discarded during downsampling since last pop operation + */ + public int getNumDiscarded() { + return downsampler != null ? downsampler.getNumberOfDiscardedItems() : 0; + } + + /** + * Removes all reads currently in the cache, and returns them in sorted order (w.r.t. alignmentStart) + * + * Flushes this cache, so after this call the cache will contain no reads, and we'll be in the same + * initial state as the constructor would put us in, with a non-null undownsampledCache and a null + * downsampler. + * + * @return a list of GATKSAMRecords in this cache + */ + public List popCurrentReads() { + final List poppedReads; + + if ( downsampler == null ) { + poppedReads = undownsampledCache; // avoid making a copy here, since we're going to allocate a new cache + } + else { + // If we triggered the downsampler, we need to sort the reads before returning them, + // since the ReservoirDownsampler is not guaranteed to preserve relative ordering of items. + // After consuming the downsampled items in this call to popCurrentReads(), we switch back + // to using the undownsampledCache until we fill up again. + poppedReads = downsampler.consumeFinalizedItems(); // avoid making a copy here + Collections.sort(poppedReads, new AlignmentStartComparator()); + downsampler = null; + } + + initializeUndownsampledCache(); + return poppedReads; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraversalEngine.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraversalEngine.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraversalEngine.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraversalEngine.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegions.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegions.java new file mode 100644 index 000000000..b3a0603f4 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegions.java @@ -0,0 +1,719 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.traversals; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.engine.datasources.providers.*; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.walkers.ActiveRegionTraversalParameters; +import org.broadinstitute.gatk.engine.walkers.ActiveRegionWalker; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; +import org.broadinstitute.gatk.utils.activeregion.ActivityProfile; +import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; +import org.broadinstitute.gatk.utils.activeregion.BandPassActivityProfile; +import org.broadinstitute.gatk.utils.nanoScheduler.NSMapFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NSProgressFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NSReduceFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NanoScheduler; +import org.broadinstitute.gatk.utils.progressmeter.ProgressMeter; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; + +import java.io.PrintStream; +import java.util.Collection; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +/** + * Implement active region traversal + * + * User: depristo + * Date: 1/9/13 + * Time: 4:45 PM + * + * Live region: + * + * The ART tracks a thing called the live region. The live region is a position on a specific contig + * of the alignment start of the last read we processed during this traversal. Because the + * read stream is sorted, future reads must occurs in the the live region. Therefore the the dead region + * (everything to the left of the live boundary) cannot have any more read data. The live / dead + * regions are used to decide when we can safely call map on active regions, as only active regions + * contained completely within the dead region (including extensions) have a complete set of read data + * in the collected read list. All of the data related to the live region is captured by the local + * variable spanOfLastReadSeen + * + */ +public final class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { + private final static boolean DEBUG = false; + protected final static Logger logger = Logger.getLogger(TraversalEngine.class); + protected final static boolean LOG_READ_CARRYING = false; + + // set by the traversal + private boolean walkerHasPresetRegions = false; + private int activeRegionExtension = -1; + private int maxRegionSize = -1; + private int minRegionSize = -1; + + private final LinkedList workQueue = new LinkedList<>(); + + private TAROrderedReadCache myReads = null; + + private GenomeLoc lastRegionProcessed = null; + private GenomeLoc spanOfLastReadSeen = null; + private ActivityProfile activityProfile = null; + int maxReadsInMemory = 0; + ActiveRegionWalker walker; + + final NanoScheduler nanoScheduler; + + /** + * Data to use in the ActiveRegionWalker.map function produced by the NanoScheduler input iterator + */ + private static class MapData { + public ActiveRegion activeRegion; + public RefMetaDataTracker tracker; + + private MapData(ActiveRegion activeRegion, RefMetaDataTracker tracker) { + this.activeRegion = activeRegion; + this.tracker = tracker; + } + } + + /** + * Create a single threaded active region traverser + */ + public TraverseActiveRegions() { + this(1); + } + + /** + * Create an active region traverser that uses nThreads for getting its work done + * @param nThreads number of threads + */ + public TraverseActiveRegions(final int nThreads) { + nanoScheduler = new NanoScheduler<>(nThreads); + nanoScheduler.setProgressFunction(new NSProgressFunction() { + @Override + public void progress(MapData lastActiveRegion) { + if ( lastActiveRegion != null ) + // note, need to use getStopLocation so we don't give an interval to ProgressMeterDaemon + printProgress(lastActiveRegion.activeRegion.getLocation().getStopLocation()); + } + }); + } + + /** + * Have the debugging output streams been initialized already? + * + * We have to do lazy initialization because when the initialize() function is called + * the streams aren't yet initialized in the GATK walker. + */ + private boolean streamsInitialized = false; + + @Override + public void initialize(GenomeAnalysisEngine engine, Walker walker, ProgressMeter progressMeter) { + super.initialize(engine, walker, progressMeter); + + this.walker = (ActiveRegionWalker)walker; + if ( this.walker.wantsExtendedReads() && ! this.walker.wantsNonPrimaryReads() ) { + throw new IllegalArgumentException("Active region walker " + this.walker + " requested extended events but not " + + "non-primary reads, an inconsistent state. Please modify the walker"); + } + + ActiveRegionTraversalParameters annotation = walker.getClass().getAnnotation(ActiveRegionTraversalParameters.class); + this.activeRegionExtension = this.walker.activeRegionExtension == null ? annotation.extension() : this.walker.activeRegionExtension; + this.maxRegionSize = this.walker.activeRegionMaxSize == null ? annotation.maxRegion() : this.walker.activeRegionMaxSize; + this.minRegionSize = annotation.minRegion(); + final double bandPassSigma = this.walker.bandPassSigma == null ? annotation.bandPassSigma() : this.walker.bandPassSigma; + walkerHasPresetRegions = this.walker.hasPresetActiveRegions(); + + activityProfile = new BandPassActivityProfile(engine.getGenomeLocParser(), engine.getIntervals(), this.walker.maxProbPropagationDistance, this.walker.activeProbThreshold, + BandPassActivityProfile.MAX_FILTER_SIZE, bandPassSigma); + + final int maxReadsAcrossSamples = annotation.maxReadsToHoldInMemoryPerSample() * ReadUtils.getSAMFileSamples(engine.getSAMFileHeader()).size(); + final int maxReadsToHoldInMemory = Math.min(maxReadsAcrossSamples, annotation.maxReadsToHoldTotal()); + myReads = new TAROrderedReadCache(maxReadsToHoldInMemory); + } + + // ------------------------------------------------------------------------------------- + // + // Utility functions + // + // ------------------------------------------------------------------------------------- + + /** + * Load in the preset regions for contig into workQueue + * + * Should be called before starting to process work on contig + * + * Can only be called when walkerHasPresetRegions is true or an IllegalStateException will be thrown + * + * @param contig the contig we are about to process + */ + protected void loadPresetRegionsForContigToWorkQueue(final String contig) { + if ( ! walkerHasPresetRegions ) throw new IllegalStateException("only appropriate to call when walker has preset regions"); + + final GenomeLoc contigSpan = engine.getGenomeLocParser().createOverEntireContig(contig); + for ( final GenomeLoc loc : this.walker.getPresetActiveRegions().getOverlapping(contigSpan) ) { + workQueue.add(new ActiveRegion(loc, null, true, engine.getGenomeLocParser(), getActiveRegionExtension())); + } + } + + protected int getActiveRegionExtension() { + return activeRegionExtension; + } + + protected int getMaxRegionSize() { + return maxRegionSize; + } + + protected int getMinRegionSize() { + return minRegionSize; + } + + @Override + public String getTraversalUnits() { + return "active regions"; + } + + @Override + public String toString() { + return "TraverseActiveRegions"; + } + + /** + * Is the loc outside of the intervals being requested for processing by the GATK? + * @param loc + * @return + */ + protected boolean outsideEngineIntervals(final GenomeLoc loc) { + return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc); + } + + // ------------------------------------------------------------------------------------- + // + // Actual traverse function + // + // ------------------------------------------------------------------------------------- + + /** + * Did read appear in the last shard? + * + * When we transition across shard boundaries we see duplicate reads because + * each shard contains the reads that *overlap* the shard. So if we just finished + * shard 1-1000 and are now in 1001-2000 we'll see duplicate reads from 1001 + * that overlapped 1-1000. This function tests read to determine if we would have + * seen it before by asking if read.getAlignmentStart() is less than the + * stop position of the last seen read at the start of the traversal. The reason + * we need to use the location of the last read at the start of the traversal + * is that we update the lastRead during the traversal, and we only want to filter + * out reads whose start is before the last read of the previous shard, not the + * current shard. + * + * @param locOfLastReadAtTraversalStart the location of the last read seen at the start of the traversal + * @param read the read we want to test if it's already been seen in the last shard + * @return true if read would have appeared in the last shard, false otherwise + */ + @Requires({"read != null"}) + private boolean appearedInLastShard(final GenomeLoc locOfLastReadAtTraversalStart, final GATKSAMRecord read) { + if ( locOfLastReadAtTraversalStart == null ) + // we're in the first shard, so obviously the answer is no + return false; + else { + // otherwise check to see if the alignment occurred in the previous shard + return read.getAlignmentStart() <= locOfLastReadAtTraversalStart.getStart() + // we're on the same contig + && read.getReferenceIndex() == locOfLastReadAtTraversalStart.getContigIndex(); + } + + } + + @Override + public T traverse( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + T sum) { + if ( LOG_READ_CARRYING || logger.isDebugEnabled() ) + logger.info(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); + + nanoScheduler.setDebug(false); + final Iterator activeRegionIterator = new ActiveRegionIterator(dataProvider); + final TraverseActiveRegionMap myMap = new TraverseActiveRegionMap(); + final TraverseActiveRegionReduce myReduce = new TraverseActiveRegionReduce(); + final T result = nanoScheduler.execute(activeRegionIterator, myMap, sum, myReduce); + + return result; + } + + private class ActiveRegionIterator implements Iterator { + private final LocusShardDataProvider dataProvider; + private LinkedList readyActiveRegions = new LinkedList<>(); + private boolean done = false; + private final LocusView locusView; + private final LocusReferenceView referenceView; + private final GenomeLoc locOfLastReadAtTraversalStart; + private final IntervalReferenceOrderedView referenceOrderedDataView; + private final GenomeLoc currentWindow; + private final boolean processRemainingActiveRegions; + + public ActiveRegionIterator( final LocusShardDataProvider dataProvider ) { + this.dataProvider = dataProvider; + locusView = new AllLocusView(dataProvider); + referenceView = new LocusReferenceView( walker, dataProvider ); + + // The data shard may carry a number of locations to process (due to being indexed together). + // This value is just the interval we are processing within the entire provider + currentWindow = dataProvider.getLocus(); + final int currentWindowPos = dataProvider.getShard().getGenomeLocs().indexOf(currentWindow); + if ( currentWindowPos == -1 ) throw new IllegalStateException("Data provider " + dataProvider + " didn't have our current window in it " + currentWindow); + processRemainingActiveRegions = currentWindowPos == dataProvider.getShard().getGenomeLocs().size() - 1; + + // the rodSpan covers all of the bases in the activity profile, including all of the bases + // through the current window interval. This is because we may issue a query to get data for an + // active region spanning before the current interval as far back as the start of the current profile, + // if we have pending work to do that finalizes in this interval. + final GenomeLoc rodSpan = activityProfile.getSpan() == null ? currentWindow : activityProfile.getSpan().endpointSpan(currentWindow); + if ( ! dataProvider.getShard().getLocation().containsP(rodSpan) ) throw new IllegalStateException("Rod span " + rodSpan + " isn't contained within the data shard " + dataProvider.getShard().getLocation() + ", meaning we wouldn't get all of the data we need"); + referenceOrderedDataView = new IntervalReferenceOrderedView( dataProvider, rodSpan ); + + // We keep processing while the next reference location is within the interval + locOfLastReadAtTraversalStart = spanOfLastSeenRead(); + + // load in the workQueue the present regions that span the current contig, if it's different from the last one + if ( walkerHasPresetRegions && ( lastRegionProcessed == null || ! currentWindow.onSameContig(lastRegionProcessed)) ) { + loadPresetRegionsForContigToWorkQueue(currentWindow.getContig()); + } + + // remember the last region we processed for sanity checking later + lastRegionProcessed = currentWindow; + } + + @Override public void remove() { throw new UnsupportedOperationException("Cannot remove from ActiveRegionIterator"); } + + @Override + public MapData next() { + return readyActiveRegions.pop(); + } + @Override + public boolean hasNext() { + if ( engine.exceedsRuntimeLimit() ) // too much time has been dedicated to doing work, just stop + return false; + if ( ! readyActiveRegions.isEmpty() ) + return true; + if ( done ) + return false; + else { + + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + rememberLastLocusLocation(location); + + // get all of the new reads that appear in the current pileup, and them to our list of reads + // provided we haven't seen them before + final Collection reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); + for( final GATKSAMRecord read : reads ) { + // note that ActiveRegionShards span entire contigs, so this check is in some + // sense no longer necessary, as any read that appeared in the last shard would now + // by definition be on a different contig. However, the logic here doesn't hurt anything + // and makes us robust should we decided to provide shards that don't fully span + // contigs at some point in the future + if ( ! appearedInLastShard(locOfLastReadAtTraversalStart, read) ) { + rememberLastReadLocation(read); + myReads.add(read); + } + } + + // skip this location -- it's not part of our engine intervals + if ( outsideEngineIntervals(location) ) + continue; + + // we've move across some interval boundary, restart profile + final boolean flushProfile = ! activityProfile.isEmpty() + && ( activityProfile.getContigIndex() != location.getContigIndex() + || location.getStart() != activityProfile.getStop() + 1); + final List newActiveRegions = prepActiveRegionsForProcessing(walker, flushProfile, false, referenceOrderedDataView); + + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation()); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + addIsActiveResult(walker, tracker, refContext, locus); + + maxReadsInMemory = Math.max(myReads.size(), maxReadsInMemory); + printProgress(location); + + if ( ! newActiveRegions.isEmpty() ) { + readyActiveRegions.addAll(newActiveRegions); + if ( DEBUG ) + for ( final MapData region : newActiveRegions ) + logger.info("Adding region to queue for processing " + region.activeRegion); + return true; + } + } + + if ( processRemainingActiveRegions ) { + // we've run out of stuff to process, and since shards now span entire contig boundaries + // we should finalized our regions. This allows us to continue to use our referenceOrderedDataView + // which would otherwise be shutdown. Only followed when the microschedule says that we're + // inside of the last window in the current shard + readyActiveRegions.addAll(prepActiveRegionsForProcessing(walker, true, true, referenceOrderedDataView)); + } + + return ! readyActiveRegions.isEmpty(); + } + } + } + + // ------------------------------------------------------------------------------------- + // + // Functions to manage and interact with the live / dead zone + // + // ------------------------------------------------------------------------------------- + + /** + * Update the live region to reflect that the last read we've seen in the traversal is read + * + * Requires that sequential calls always be provided reads in coordinate sorted order + * + * @param read the last read we've seen during the traversal + */ + @Requires({"read != null"}) + protected void rememberLastReadLocation(final GATKSAMRecord read) { + final GenomeLoc currentLocation = engine.getGenomeLocParser().createGenomeLoc(read); + if ( spanOfLastReadSeen == null ) + spanOfLastReadSeen = currentLocation; + else { + if ( currentLocation.isBefore(spanOfLastReadSeen) ) + throw new IllegalStateException("Updating last read seen in the traversal with read " + read + " with span " + currentLocation + " but this occurs before the previously seen read " + spanOfLastReadSeen); + spanOfLastReadSeen = currentLocation; + } + } + + /** + * Update the live region to reflect that we've reached locus + * + * This function is complementary to #rememberLastReadLocation, but if we don't have any reads for a long + * time (e.g., there's no coverage) we will keep active regions around far longer than necessary. + * + * Only updates the span if it's beyond the last seen + * + * @param currentLocation the current location we've processed on the genome + */ + protected void rememberLastLocusLocation(final GenomeLoc currentLocation) { + if ( spanOfLastReadSeen == null ) + spanOfLastReadSeen = currentLocation; + else { + if ( currentLocation.isPast(spanOfLastReadSeen) ) + spanOfLastReadSeen = currentLocation; + } + } + + + /** + * Get a GenomeLoc indicating the start (heading to the right) of the live ART region. + * @return the left-most position of the live region on the genome + */ + protected GenomeLoc spanOfLastSeenRead() { + return spanOfLastReadSeen; + } + + /** + * Is the active region completely within the traversal's dead zone? + * + * @param region the region we want to test + * @return true if the extended location of region is completely within the current dead zone, false otherwise + */ + protected boolean regionCompletelyWithinDeadZone(final ActiveRegion region) { + if ( spanOfLastSeenRead() == null ) + return false; + + final int contigCmp = region.getExtendedLoc().compareContigs(spanOfLastSeenRead()); + if ( contigCmp > 0 ) + throw new IllegalStateException("Active region " + region + " on a contig after last seen read " + spanOfLastSeenRead()); + else { + return contigCmp < 0 || region.getExtendedLoc().getStop() < spanOfLastSeenRead().getStart(); + } + } + + /** + * Is the read dead? That is, can it no longer be in any future active region, and therefore can be discarded? + * + * read: start |--------> stop ------ stop + extension + * region: start |-----------------| end + * + * Since the regions are coming in order, read could potentially be contained in a future interval if + * stop + activeRegionExtension >= end. If, on the other hand, stop + extension is < the end + * of this region, then we can discard it, since any future region could only include reads + * up to end + 1 - extension. + * + * Note that this function doesn't care about the dead zone. We're assuming that by + * actually calling this function with an active region that region is already in the dead zone, + * so checking that the read is in the dead zone doesn't make sense. + * + * @param read the read we're testing + * @param activeRegion the current active region + * @return true if the read is dead, false other + */ + @Requires({"read != null", "activeRegion != null"}) + private boolean readCannotOccurInAnyMoreActiveRegions(final GATKSAMRecord read, final ActiveRegion activeRegion) { + return read.getReferenceIndex() < activeRegion.getLocation().getContigIndex() || + ( read.getReferenceIndex() == activeRegion.getLocation().getContigIndex() + && read.getAlignmentEnd() + getActiveRegionExtension() < activeRegion.getLocation().getStop() ); + } + + // ------------------------------------------------------------------------------------- + // + // Functions to write out activity profiles and active regions + // + // ------------------------------------------------------------------------------------- + + /** + * Initialize the debugging output streams (activity profile and active regions), if not done so already + */ + @Ensures("streamsInitialized == true") + private void initializeOutputStreamsIfNecessary() { + if ( ! streamsInitialized ) { + streamsInitialized = true; + if ( walker.activityProfileOutStream != null ) { + printIGVFormatHeader(walker.activityProfileOutStream, "line", "ActivityProfile"); + } + + if ( walker.activeRegionOutStream != null ) { + printIGVFormatHeader(walker.activeRegionOutStream, "line", "ActiveRegions"); + } + } + } + + /** + * Helper function to write out a IGV formatted line to out, at loc, with values + * + * http://www.broadinstitute.org/software/igv/IGV + * + * @param out a non-null PrintStream where we'll write our line + * @param graphType the type of graph to show in IGV for this track + * @param columns the column names for this IGV track + */ + @Requires({ + "out != null", + "graphType != null", + "columns.length > 0" + }) + private void printIGVFormatHeader(final PrintStream out, final String graphType, final String ... columns ) { + out.printf("#track graphType=%s%n", graphType); + out.printf("Chromosome\tStart\tEnd\tFeature\t%s%n", Utils.join("\t", columns)); + + } + + /** + * Helper function to write out a IGV formatted line to out, at loc, with values + * + * http://www.broadinstitute.org/software/igv/IGV + * + * @param out a non-null PrintStream where we'll write our line + * @param loc the location of values + * @param featureName string name of this feature (see IGV format) + * @param values the floating point values to associate with loc and feature name in out + */ + @Requires({ + "out != null", + "loc != null", + "values.length > 0" + }) + private void printIGVFormatRow(final PrintStream out, final GenomeLoc loc, final String featureName, final double ... values) { + // note that start and stop are 0 based, but the stop is exclusive so we don't subtract 1 + out.printf("%s\t%d\t%d\t%s", loc.getContig(), loc.getStart() - 1, loc.getStop(), featureName); + for ( final double value : values ) + out.print(String.format("\t%.5f", value)); + out.println(); + } + + /** + * Write out activity profile information, if requested by the walker + * + * @param states the states in the current activity profile + */ + @Requires("states != null") + private void writeActivityProfile(final List states) { + if ( walker.activityProfileOutStream != null ) { + initializeOutputStreamsIfNecessary(); + for ( final ActivityProfileState state : states ) { + printIGVFormatRow(walker.activityProfileOutStream, state.getLoc(), "state", Math.min(state.isActiveProb, 1.0)); + } + } + } + + /** + * Write out each active region to the walker activeRegionOutStream + * + * @param region the region we're currently operating on + */ + @Requires("region != null") + private void writeActiveRegion(final ActiveRegion region) { + if( walker.activeRegionOutStream != null ) { + initializeOutputStreamsIfNecessary(); + printIGVFormatRow(walker.activeRegionOutStream, region.getLocation().getStartLocation(), + "end-marker", 0.0); + printIGVFormatRow(walker.activeRegionOutStream, region.getLocation(), + "size=" + region.getLocation().size(), region.isActive() ? 1.0 : -1.0); + } + } + + + // ------------------------------------------------------------------------------------- + // + // Functions to process active regions that are ready for map / reduce calls + // + // ------------------------------------------------------------------------------------- + + /** + * Invoke the walker isActive function, and incorporate its result into the activity profile + * + * @param walker the walker we're running + * @param tracker the ref meta data tracker to pass on to the isActive function of walker + * @param refContext the refContext to pass on to the isActive function of walker + * @param locus the AlignmentContext to pass on to the isActive function of walker + */ + private void addIsActiveResult(final ActiveRegionWalker walker, + final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext locus) { + // must be called, even if we won't use the result, to satisfy walker contract + final ActivityProfileState state = walker.isActive( tracker, refContext, locus ); + if ( walker.forceActive) state.isActiveProb = 1.0; + if ( ! walkerHasPresetRegions ) { + activityProfile.add(state); + } + } + + /** + * Take the individual isActive calls and integrate them into contiguous active regions and + * add these blocks of work to the work queue + * band-pass filter the list of isActive probabilities and turn into active regions + */ + private List prepActiveRegionsForProcessing(final ActiveRegionWalker walker, + final boolean flushActivityProfile, + final boolean forceAllRegionsToBeActive, + final IntervalReferenceOrderedView referenceOrderedDataView) { + if ( ! walkerHasPresetRegions ) { + // We don't have preset regions, so we get our regions from the activity profile + final Collection activeRegions = activityProfile.popReadyActiveRegions(getActiveRegionExtension(), getMinRegionSize(), getMaxRegionSize(), flushActivityProfile); + workQueue.addAll(activeRegions); + if ( ! activeRegions.isEmpty() && logger.isDebugEnabled() ) logger.debug("Integrated " + activityProfile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + } + + // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them + final LinkedList readyRegions = new LinkedList<>(); + while( workQueue.peek() != null ) { + final ActiveRegion activeRegion = workQueue.peek(); + if ( forceAllRegionsToBeActive || regionCompletelyWithinDeadZone(activeRegion) ) { + writeActivityProfile(activeRegion.getSupportingStates()); + writeActiveRegion(activeRegion); + readyRegions.add(prepActiveRegionForProcessing(workQueue.remove(), walker, referenceOrderedDataView)); + } else { + break; + } + } + + return readyRegions; + + } + + private MapData prepActiveRegionForProcessing(final ActiveRegion activeRegion, + final ActiveRegionWalker walker, + final IntervalReferenceOrderedView referenceOrderedDataView) { + final List stillLive = new LinkedList<>(); + for ( final GATKSAMRecord read : myReads.popCurrentReads() ) { + boolean killed = false; + final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); + + if( activeRegion.getLocation().overlapsP( readLoc ) ) { + activeRegion.add(read); + + if ( ! walker.wantsNonPrimaryReads() ) { + killed = true; + } + } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { + activeRegion.add( read ); + } + + // if the read hasn't already been killed, check if it cannot occur in any more active regions, and maybe kill it + if ( ! killed && readCannotOccurInAnyMoreActiveRegions(read, activeRegion) ) { + killed = true; + } + + // keep track of all of the still live active regions + if ( ! killed ) stillLive.add(read); + } + myReads.addAll(stillLive); + + if ( logger.isDebugEnabled() ) { + logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive() ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReadSpanLoc()); + } + + if ( LOG_READ_CARRYING ) + logger.info(String.format("Processing region %20s span=%3d active?=%5b with %4d reads. Overall max reads carried is %s", + activeRegion.getLocation(), activeRegion.getLocation().size(), activeRegion.isActive(), activeRegion.size(), maxReadsInMemory)); + + // prepare the RefMetaDataTracker information + final GenomeLoc loc = activeRegion.getLocation(); + // get all of the RODs that cover the active region (without extension) + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataForInterval(loc); + // trim away all of the features that occurred before this location, as we will not need them in the future + referenceOrderedDataView.trimCurrentFeaturesToLoc(loc); + + return new MapData(activeRegion, tracker); + } + + private class TraverseActiveRegionMap implements NSMapFunction { + @Override + public M apply(final MapData mapData) { + if ( DEBUG ) logger.info("Executing walker.map for " + mapData.activeRegion + " in thread " + Thread.currentThread().getName()); + return walker.map(mapData.activeRegion, mapData.tracker); + } + } + + private class TraverseActiveRegionReduce implements NSReduceFunction { + @Override + public T apply(M one, T sum) { + return walker.reduce(one, sum); + } + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicates.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicates.java new file mode 100644 index 000000000..a8c88aace --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicates.java @@ -0,0 +1,205 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.traversals; + +import htsjdk.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.gatk.engine.datasources.providers.ReadView; +import org.broadinstitute.gatk.utils.iterators.PushbackIterator; +import org.broadinstitute.gatk.engine.walkers.DuplicateWalker; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.*; + +/** + * @author Mark DePristo + * @version 0.1 + *

    + * Class TraverseDuplicates + *

    + * This class handles traversing lists of duplicate reads in the new shardable style + */ +public class TraverseDuplicates extends TraversalEngine,ReadShardDataProvider> { + /** our log, which we want to capture anything from this class */ + protected static Logger logger = Logger.getLogger(TraverseDuplicates.class); + + /** Turn this to true to enable logger.debug output */ + private final boolean DEBUG = false; + + @Override + public String getTraversalUnits() { + return "dups"; + } + + private List readsAtLoc(final GATKSAMRecord read, PushbackIterator iter) { + GenomeLoc site = engine.getGenomeLocParser().createGenomeLoc(read); + ArrayList l = new ArrayList(); + + l.add(read); + for (SAMRecord read2 : iter) { + GenomeLoc site2 = engine.getGenomeLocParser().createGenomeLoc(read2); + + // the next read starts too late + if (site2.getStart() != site.getStart()) { + iter.pushback(read2); + break; + } else { + l.add((GATKSAMRecord) read2); + } + } + + return l; + } + + /** + * Creates a set of lists of reads, where each list contains reads from the same underlying molecule according + * to their duplicate flag and their (and mate, if applicable) start/end positions. + * + * @param reads the list of reads to split into unique molecular samples + * @return + */ + protected Set> uniqueReadSets(List reads) { + Set> readSets = new LinkedHashSet>(); + + // for each read, find duplicates, and either add the read to its duplicate list or start a new one + for ( GATKSAMRecord read : reads ) { + List readSet = findDuplicateReads(read, readSets); + + if ( readSet == null ) { + readSets.add(new ArrayList(Arrays.asList(read))); // copy so I can add to the list + } else { + readSet.add(read); + } + } + + return readSets; + } + + /** + * Find duplicate reads for read in the set of unique reads. This is effective a duplicate marking algorithm, + * but it relies for safety's sake on the file itself being marked by a true duplicate marking algorithm. Pair + * and single-end read aware. + * + * @param read + * @param readSets + * @return The list of duplicate reads that read is a member of, or null if it's the only one of its kind + */ + protected List findDuplicateReads(GATKSAMRecord read, Set> readSets ) { + if ( read.getReadPairedFlag() ) { + // paired + final GenomeLoc readMateLoc = engine.getGenomeLocParser().createGenomeLoc(read.getMateReferenceName(), read.getMateAlignmentStart(), read.getMateAlignmentStart()); + + for (List reads : readSets) { + GATKSAMRecord key = reads.get(0); + + // read and key start at the same place, and either the this read and the key + // share a mate location or the read is flagged as a duplicate + if ( read.getAlignmentStart() == key.getAlignmentStart() && key.getReadPairedFlag() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) ) { + // at least one has to be marked as a duplicate + final GenomeLoc keyMateLoc = engine.getGenomeLocParser().createGenomeLoc(key.getMateReferenceName(), key.getMateAlignmentStart(), key.getMateAlignmentStart()); + if ( readMateLoc.compareTo(keyMateLoc) == 0 ) { + // we are at the same position as the dup and have the same mat pos, it's a dup + if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s %d %s vs. %s", read, reads.size(), readMateLoc, keyMateLoc)); + return reads; + } + } + } + } else { + for (List reads : readSets) { + GATKSAMRecord key = reads.get(0); + boolean v = (! key.getReadPairedFlag()) && read.getAlignmentStart() == key.getAlignmentStart() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) && read.getReadLength() == key.getReadLength(); + //System.out.printf("%s %s %b %b %d %d %d %d => %b%n", + // read.getReadPairedFlag(), key.getReadPairedFlag(), read.getDuplicateReadFlag(), key.getDuplicateReadFlag(), + // read.getAlignmentStart(), key.getAlignmentStart(), read.getReadLength(), key.getReadLength(), v); + if ( v ) { + //System.out.printf("Returning reads...%n"); + return reads; + } + } + } + + return null; + } + + // -------------------------------------------------------------------------------------------------------------- + // + // new style interface to the system + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Traverse by reads, given the data and the walker + * + * @param walker the walker to execute over + * @param sum of type T, the return from the walker + * + * @return the result type T, the product of all the reduce calls + */ + public T traverse(DuplicateWalker walker, + ReadShardDataProvider dataProvider, + T sum) { + PushbackIterator iter = new PushbackIterator(new ReadView(dataProvider).iterator()); + + /** + * while we still have more reads: + * ok, here's the idea. We get all the reads that start at the same position in the genome + * We then split the list of reads into sublists of reads: + * -> those with the same mate pair position, for paired reads + * -> those flagged as unpaired and duplicated but having the same start and end + */ + boolean done = walker.isDone(); + for (SAMRecord read : iter) { + if ( done ) break; + // get the genome loc from the read + GenomeLoc site = engine.getGenomeLocParser().createGenomeLoc(read); + + Set> readSets = uniqueReadSets(readsAtLoc((GATKSAMRecord) read, iter)); + if ( DEBUG ) logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d read sets", site, readSets.size())); + + // Jump forward in the reference to this locus location + AlignmentContext locus = new AlignmentContext(site, new ReadBackedPileupImpl(site)); + + // update the number of duplicate sets we've seen + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // actually call filter and map, accumulating sum + final boolean keepMeP = walker.filter(site, locus, readSets); + if (keepMeP) { + M x = walker.map(site, locus, readSets); + sum = walker.reduce(x, sum); + } + + printProgress(site.getStopLocation()); + done = walker.isDone(); + } + + return sum; + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseLociNano.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseLociNano.java new file mode 100644 index 000000000..1c16c0e19 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseLociNano.java @@ -0,0 +1,304 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.traversals; + +import org.broadinstitute.gatk.engine.WalkerManager; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.engine.datasources.providers.*; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.walkers.DataSource; +import org.broadinstitute.gatk.engine.walkers.LocusWalker; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.nanoScheduler.NSMapFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NSProgressFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NSReduceFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NanoScheduler; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; + +import java.util.Iterator; + +/** + * A simple solution to iterating over all reference positions over a series of genomic locations. + */ +public class TraverseLociNano extends TraversalEngine,LocusShardDataProvider> { + /** our log, which we want to capture anything from this class */ + private static final boolean DEBUG = false; + + final NanoScheduler nanoScheduler; + + public TraverseLociNano(int nThreads) { + nanoScheduler = new NanoScheduler(nThreads); + nanoScheduler.setProgressFunction(new TraverseLociProgress()); + } + + @Override + public final String getTraversalUnits() { + return "sites"; + } + + protected static class TraverseResults { + final int numIterations; + final T reduceResult; + + public TraverseResults(int numIterations, T reduceResult) { + this.numIterations = numIterations; + this.reduceResult = reduceResult; + } + } + + @Override + public T traverse( LocusWalker walker, + LocusShardDataProvider dataProvider, + T sum) { + logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider)); + + final LocusView locusView = getLocusView( walker, dataProvider ); + + if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all + //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); + ReferenceOrderedView referenceOrderedDataView = null; + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); + else + referenceOrderedDataView = (RodLocusView)locusView; + + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + + final TraverseResults result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); + sum = result.reduceResult; + dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); + } + + // We have a final map call to execute here to clean up the skipped based from the + // last position in the ROD to that in the interval + if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { + // only do this if the walker isn't done! + final RodLocusView rodLocusView = (RodLocusView)locusView; + final long nSkipped = rodLocusView.getLastSkippedBases(); + if ( nSkipped > 0 ) { + final GenomeLoc site = rodLocusView.getLocOneBeyondShard(); + final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); + final M x = walker.map(null, null, ac); + sum = walker.reduce(x, sum); + } + } + + return sum; + } + + /** + * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' + * of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype + * that comes along. + * @param walker walker to interrogate. + * @param dataProvider Data which which to drive the locus view. + * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. + */ + private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { + final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); + if( dataSource == DataSource.READS ) + return new CoveredLocusView(dataProvider); + else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) + return new AllLocusView(dataProvider); + else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) + return new RodLocusView(dataProvider); + else + throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); + } + + protected TraverseResults traverse(final LocusWalker walker, + final LocusView locusView, + final LocusReferenceView referenceView, + final ReferenceOrderedView referenceOrderedDataView, + final T sum) { + nanoScheduler.setDebug(DEBUG); + final TraverseLociMap myMap = new TraverseLociMap(walker); + final TraverseLociReduce myReduce = new TraverseLociReduce(walker); + + final MapDataIterator inputIterator = new MapDataIterator(locusView, referenceView, referenceOrderedDataView); + final T result = nanoScheduler.execute(inputIterator, myMap, sum, myReduce); + + return new TraverseResults(inputIterator.numIterations, result); + } + + /** + * Create iterator that provides inputs for all map calls into MapData, to be provided + * to NanoScheduler for Map/Reduce + */ + private class MapDataIterator implements Iterator { + final LocusView locusView; + final LocusReferenceView referenceView; + final ReferenceOrderedView referenceOrderedDataView; + int numIterations = 0; + + private MapDataIterator(LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView) { + this.locusView = locusView; + this.referenceView = referenceView; + this.referenceOrderedDataView = referenceOrderedDataView; + } + + @Override + public boolean hasNext() { + return locusView.hasNext() && ! engine.exceedsRuntimeLimit(); + } + + @Override + public MapData next() { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + //logger.info("Pulling data from MapDataIterator at " + location); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(location); + + numIterations++; + return new MapData(locus, refContext, tracker); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Cannot remove elements from MapDataIterator"); + } + } + + @Override + public void shutdown() { + nanoScheduler.shutdown(); + } + + /** + * The input data needed for each map call. The read, the reference, and the RODs + */ + private class MapData { + final AlignmentContext alignmentContext; + final ReferenceContext refContext; + final RefMetaDataTracker tracker; + + private MapData(final AlignmentContext alignmentContext, ReferenceContext refContext, RefMetaDataTracker tracker) { + this.alignmentContext = alignmentContext; + this.refContext = refContext; + this.tracker = tracker; + } + + @Override + public String toString() { + return "MapData " + alignmentContext.getLocation(); + } + } + + /** + * Contains the results of a map call, indicating whether the call was good, filtered, or done + */ + private class MapResult { + final M value; + final boolean reduceMe; + + /** + * Create a MapResult with value that should be reduced + * + * @param value the value to reduce + */ + private MapResult(final M value) { + this.value = value; + this.reduceMe = true; + } + + /** + * Create a MapResult that shouldn't be reduced + */ + private MapResult() { + this.value = null; + this.reduceMe = false; + } + } + + /** + * A static object that tells reduce that the result of map should be skipped (filtered or done) + */ + private final MapResult SKIP_REDUCE = new MapResult(); + + /** + * MapFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Applies walker.map to MapData, returning a MapResult object containing the result + */ + private class TraverseLociMap implements NSMapFunction { + final LocusWalker walker; + + private TraverseLociMap(LocusWalker walker) { + this.walker = walker; + } + + @Override + public MapResult apply(final MapData data) { + if ( ! walker.isDone() ) { + final boolean keepMeP = walker.filter(data.tracker, data.refContext, data.alignmentContext); + if (keepMeP) { + final M x = walker.map(data.tracker, data.refContext, data.alignmentContext); + return new MapResult(x); + } + } + return SKIP_REDUCE; + } + } + + /** + * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable + */ + private class TraverseLociReduce implements NSReduceFunction { + final LocusWalker walker; + + private TraverseLociReduce(LocusWalker walker) { + this.walker = walker; + } + + @Override + public T apply(MapResult one, T sum) { + if ( one.reduceMe ) + // only run reduce on values that aren't DONE or FAILED + return walker.reduce(one.value, sum); + else + return sum; + } + } + + private class TraverseLociProgress implements NSProgressFunction { + @Override + public void progress(MapData lastProcessedMap) { + if (lastProcessedMap.alignmentContext != null) + printProgress(lastProcessedMap.alignmentContext.getLocation()); + } + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadPairs.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadPairs.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadPairs.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadPairs.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsNano.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsNano.java new file mode 100644 index 000000000..e392041f0 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsNano.java @@ -0,0 +1,256 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.traversals; + +import htsjdk.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.engine.datasources.providers.ReadBasedReferenceOrderedView; +import org.broadinstitute.gatk.engine.datasources.providers.ReadReferenceView; +import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.gatk.engine.datasources.providers.ReadView; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.walkers.ReadWalker; +import org.broadinstitute.gatk.utils.nanoScheduler.NSMapFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NSProgressFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NSReduceFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NanoScheduler; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.Iterator; +import java.util.LinkedList; + +/** + * A nano-scheduling version of TraverseReads. + * + * Implements the traversal of a walker that accepts individual reads, the reference, and + * RODs per map call. Directly supports shared memory parallelism via NanoScheduler + * + * @author depristo + * @version 1.0 + * @date 9/2/2012 + */ +public class TraverseReadsNano extends TraversalEngine,ReadShardDataProvider> { + /** our log, which we want to capture anything from this class */ + private final static boolean PRE_READ_ALL_MAP_DATA = true; + protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); + private static final boolean DEBUG = false; + final NanoScheduler nanoScheduler; + + public TraverseReadsNano(int nThreads) { + nanoScheduler = new NanoScheduler(nThreads); + nanoScheduler.setProgressFunction(new NSProgressFunction() { + @Override + public void progress(MapData lastProcessedMap) { + if ( lastProcessedMap.refContext != null ) + // note, need to use getStopLocation so we don't give an interval to ProgressMeterDaemon + printProgress(lastProcessedMap.refContext.getLocus().getStopLocation()); + } + }); + } + + @Override + public String getTraversalUnits() { + return "reads"; + } + + /** + * Traverse by reads, given the data and the walker + * + * @param walker the walker to traverse with + * @param dataProvider the provider of the reads data + * @param sum the value of type T, specified by the walker, to feed to the walkers reduce function + * @return the reduce variable of the read walker + */ + public T traverse(ReadWalker walker, + ReadShardDataProvider dataProvider, + T sum) { + if ( logger.isDebugEnabled() ) + logger.debug(String.format("TraverseReadsNano.traverse Covered dataset is %s", dataProvider)); + + if( !dataProvider.hasReads() ) + throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); + + nanoScheduler.setDebug(DEBUG); + final TraverseReadsMap myMap = new TraverseReadsMap(walker); + final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker); + + final Iterator aggregatedInputs = aggregateMapData(dataProvider); + final T result = nanoScheduler.execute(aggregatedInputs, myMap, sum, myReduce); + + return result; + } + + /** + * Aggregate all of the inputs for all map calls into MapData, to be provided + * to NanoScheduler for Map/Reduce + * + * @param dataProvider the source of our data + * @return a linked list of MapData objects holding the read, ref, and ROD info for every map/reduce + * should execute + */ + private Iterator aggregateMapData(final ReadShardDataProvider dataProvider) { + final Iterator it = makeDataIterator(dataProvider); + if ( PRE_READ_ALL_MAP_DATA ) { + final LinkedList l = new LinkedList(); + while ( it.hasNext() ) l.add(it.next()); + return l.iterator(); + } else { + return it; + } + } + + + private Iterator makeDataIterator(final ReadShardDataProvider dataProvider) { + return new Iterator () { + final ReadView reads = new ReadView(dataProvider); + final ReadReferenceView reference = new ReadReferenceView(dataProvider); + final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); + final Iterator readIterator = reads.iterator(); + + @Override public boolean hasNext() { return ! engine.exceedsRuntimeLimit() && readIterator.hasNext(); } + + @Override + public MapData next() { + final SAMRecord read = readIterator.next(); + final ReferenceContext refContext = ! read.getReadUnmappedFlag() + ? reference.getReferenceContext(read) + : null; + + // if the read is mapped, create a metadata tracker + final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 + ? rodView.getReferenceOrderedDataForRead(read) + : null; + + // update the number of reads we've seen + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + return new MapData((GATKSAMRecord)read, refContext, tracker); + } + + @Override public void remove() { + throw new UnsupportedOperationException("Remove not supported"); + } + }; + } + + @Override + public void shutdown() { + nanoScheduler.shutdown(); + } + + /** + * The input data needed for each map call. The read, the reference, and the RODs + */ + private class MapData { + final GATKSAMRecord read; + final ReferenceContext refContext; + final RefMetaDataTracker tracker; + + private MapData(GATKSAMRecord read, ReferenceContext refContext, RefMetaDataTracker tracker) { + this.read = read; + this.refContext = refContext; + this.tracker = tracker; + } + } + + /** + * Contains the results of a map call, indicating whether the call was good, filtered, or done + */ + private class MapResult { + final M value; + final boolean reduceMe; + + /** + * Create a MapResult with value that should be reduced + * + * @param value the value to reduce + */ + private MapResult(final M value) { + this.value = value; + this.reduceMe = true; + } + + /** + * Create a MapResult that shouldn't be reduced + */ + private MapResult() { + this.value = null; + this.reduceMe = false; + } + } + + /** + * A static object that tells reduce that the result of map should be skipped (filtered or done) + */ + private final MapResult SKIP_REDUCE = new MapResult(); + + /** + * MapFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Applies walker.map to MapData, returning a MapResult object containing the result + */ + private class TraverseReadsMap implements NSMapFunction { + final ReadWalker walker; + + private TraverseReadsMap(ReadWalker walker) { + this.walker = walker; + } + + @Override + public MapResult apply(final MapData data) { + if ( ! walker.isDone() ) { + final boolean keepMeP = walker.filter(data.refContext, data.read); + if (keepMeP) + return new MapResult(walker.map(data.refContext, data.read, data.tracker)); + } + + return SKIP_REDUCE; + } + } + + /** + * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable + */ + private class TraverseReadsReduce implements NSReduceFunction { + final ReadWalker walker; + + private TraverseReadsReduce(ReadWalker walker) { + this.walker = walker; + } + + @Override + public T apply(MapResult one, T sum) { + if ( one.reduceMe ) + // only run reduce on values that aren't DONE or FAILED + return walker.reduce(one.value, sum); + else + return sum; + } + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/package-info.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionTraversalParameters.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionTraversalParameters.java new file mode 100644 index 000000000..bcae8ecdc --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionTraversalParameters.java @@ -0,0 +1,97 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import org.broadinstitute.gatk.utils.activeregion.BandPassActivityProfile; + +import java.lang.annotation.Documented; +import java.lang.annotation.Inherited; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; + +/** + * Describes the parameters that this walker requires of the active region traversal + * + * User: rpoplin + * Date: 1/18/12 + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) + +public @interface ActiveRegionTraversalParameters { + /** + * How far to either side of the active region itself should we include reads? + * + * That is, if the active region is 10 bp wide, and extension is 5, ART will provide + * the walker with active regions 10 bp, with 5 bp of extension on either side, and + * all reads that cover the 20 bp of the region + extension. + * + * @return the size of the active region extension we'd like + */ + public int extension() default 0; + + /** + * The minimum number of bp for an active region, when we need to chop it up into pieces because + * it's become too big. This only comes into effect when there's literally no good place to chop + * that does make the region smaller than this value. + * + * @return the min size in bp of regions + */ + public int minRegion() default 50; + + /** + * The maximum size in bp of active regions wanted by this walker + * + * Active regions larger than this value are automatically cut up by ART into smaller + * regions of size <= this value. + * + * @return the max size in bp of regions + */ + public int maxRegion() default 1500; + + /** + * The variance value for the Gaussian kernel of the band pass filter employed by ART + * @return the breadth of the band pass gaussian kernel we want for our traversal + */ + public double bandPassSigma() default BandPassActivityProfile.DEFAULT_SIGMA; + + /** + * What is the maximum number of reads we're willing to hold in memory per sample + * during the traversal? This limits our exposure to unusually large amounts + * of coverage in the engine. + * @return the maximum number of reads we're willing to hold in memory + */ + public int maxReadsToHoldInMemoryPerSample() default 30000; + + /** + * No matter what the per sample value says, we will never hold more than this + * number of reads in memory at any time. Provides an upper bound on the total number + * of reads in the case where we have a lot of samples. + * @return the maximum number of reads to hold in memory + */ + public int maxReadsToHoldTotal() default 10000000; +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java new file mode 100644 index 000000000..eb964c826 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java @@ -0,0 +1,196 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import com.google.java.contract.Ensures; +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.engine.filters.*; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; +import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.interval.IntervalSetRule; +import org.broadinstitute.gatk.utils.interval.IntervalUtils; + +import java.io.PrintStream; +import java.util.*; + +/** + * Base class for all the Active Region Walkers. + * User: rpoplin + * Date: 12/7/11 + */ + +@By(DataSource.READS) +@Requires({DataSource.READS, DataSource.REFERENCE}) +@PartitionBy(PartitionType.READ) +@ActiveRegionTraversalParameters(extension=50,maxRegion=1500) +@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class}) +@Downsample(by = DownsampleType.BY_SAMPLE, toCoverage = 1000) +@RemoveProgramRecords +public abstract class ActiveRegionWalker extends Walker { + /** + * If provided, this walker will write out its activity profile (per bp probabilities of being active) + * to this file in the IGV formatted TAB deliminated output: + * + * http://www.broadinstitute.org/software/igv/IGV + * + * Intended to make debugging the activity profile calculations easier + */ + @Output(fullName="activityProfileOut", shortName="APO", doc="Output the raw activity profile results in IGV format", required = false, defaultToStdout = false) + public PrintStream activityProfileOutStream = null; + + /** + * If provided, this walker will write out its active and inactive regions + * to this file in the IGV formatted TAB deliminated output: + * + * http://www.broadinstitute.org/software/igv/IGV + * + * Intended to make debugging the active region calculations easier + */ + @Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this IGV formatted file", required = false, defaultToStdout = false) + public PrintStream activeRegionOutStream = null; + + @Advanced + @Input(fullName="activeRegionIn", shortName="AR", doc="Use this interval list file as the active regions to process", required = false) + protected List> activeRegionBindings = null; + + @Advanced + @Argument(fullName="activeRegionExtension", shortName="activeRegionExtension", doc="The active region extension; if not provided defaults to Walker annotated default", required = false) + public Integer activeRegionExtension = null; + + /** + * For the active region walker to treat all bases as active. Useful for debugging when you want to force something like + * the HaplotypeCaller to process a specific interval you provide the GATK + */ + @Advanced + @Argument(fullName="forceActive", shortName="forceActive", doc="If provided, all bases will be tagged as active", required = false) + public boolean forceActive = false; + + @Advanced + @Argument(fullName="activeRegionMaxSize", shortName="activeRegionMaxSize", doc="The active region maximum size; if not provided defaults to Walker annotated default", required = false) + public Integer activeRegionMaxSize = null; + + @Advanced + @Argument(fullName="bandPassSigma", shortName="bandPassSigma", doc="The sigma of the band pass filter Gaussian kernel; if not provided defaults to Walker annotated default", required = false) + public Double bandPassSigma = null; + + /* + * For active region limits in ActivityProfile +* */ + @Hidden + @Argument(fullName = "maxProbPropagationDistance", shortName = "maxProbPropDist", minValue = 0, doc="Region probability propagation distance beyond it's maximum size.", required = false) + public Integer maxProbPropagationDistance = 50; + + @Advanced + @Argument(fullName = "activeProbabilityThreshold", shortName = "ActProbThresh", minValue = 0.0, maxValue = 1.0, doc="Threshold for the probability of a profile state being active.", required = false) + public Double activeProbThreshold = 0.002; + + private GenomeLocSortedSet presetActiveRegions = null; + + @Override + public void initialize() { + if( activeRegionBindings == null ) { return; } + List allIntervals = new ArrayList(0); + for ( IntervalBinding intervalBinding : activeRegionBindings ) { + List intervals = intervalBinding.getIntervals(this.getToolkit().getGenomeLocParser()); + + if ( intervals.isEmpty() ) { + logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed."); + } + + allIntervals = IntervalUtils.mergeListsBySetOperator(intervals, allIntervals, IntervalSetRule.UNION); + } + + presetActiveRegions = IntervalUtils.sortAndMergeIntervals(this.getToolkit().getGenomeLocParser(), allIntervals, IntervalMergingRule.ALL); + } + + /** + * Does this walker want us to use a set of preset action regions instead of dynamically using the result of isActive? + * @return true if yes, false if no + */ + public boolean hasPresetActiveRegions() { + return presetActiveRegions != null; + } + + /** + * Get the set of preset active regions, or null if none were provided + * @return a set of genome locs specifying fixed active regions requested by the walker, or null if none exist + */ + public GenomeLocSortedSet getPresetActiveRegions() { + return presetActiveRegions; + } + + // Do we actually want to operate on the context? + public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + return true; // We are keeping all the reads + } + + public EnumSet desiredReadStates() { + return EnumSet.of(ActiveRegionReadState.PRIMARY); + } + + public final boolean wantsNonPrimaryReads() { + return desiredReadStates().contains(ActiveRegionReadState.NONPRIMARY); + } + + public boolean wantsExtendedReads() { + return desiredReadStates().contains(ActiveRegionReadState.EXTENDED); + } + + public boolean wantsUnmappedReads() { + return desiredReadStates().contains(ActiveRegionReadState.UNMAPPED); + } + + // Determine probability of active status over the AlignmentContext + @Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"}) + public abstract ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context); + + // Map over the ActiveRegion + public abstract MapType map(final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker); + + public final GenomeLocSortedSet extendIntervals( final GenomeLocSortedSet intervals, final GenomeLocParser genomeLocParser, IndexedFastaSequenceFile reference ) { + final int activeRegionExtension = this.getClass().getAnnotation(ActiveRegionTraversalParameters.class).extension(); + final List allIntervals = new ArrayList(); + for( final GenomeLoc interval : intervals.toList() ) { + final int start = Math.max( 1, interval.getStart() - activeRegionExtension ); + final int stop = Math.min( reference.getSequenceDictionary().getSequence(interval.getContig()).getSequenceLength(), interval.getStop() + activeRegionExtension ); + allIntervals.add( genomeLocParser.createGenomeLoc(interval.getContig(), start, stop) ); + } + return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, IntervalMergingRule.ALL); + } + + +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Allows.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Allows.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Allows.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Allows.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Attribution.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Attribution.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Attribution.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Attribution.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/BAQMode.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/BAQMode.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/BAQMode.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/BAQMode.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/By.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/By.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/By.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/By.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/DataSource.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/DataSource.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/DataSource.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/DataSource.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/DisabledReadFilters.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/DisabledReadFilters.java new file mode 100644 index 000000000..ccf09fd40 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/DisabledReadFilters.java @@ -0,0 +1,41 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import htsjdk.samtools.filter.SamRecordFilter; + +import java.lang.annotation.*; + +/** + * An annotation to describe which inherited ReadFilters to disable + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface DisabledReadFilters { + public Class[] value() default {}; +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Downsample.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Downsample.java new file mode 100644 index 000000000..f85123ab6 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Downsample.java @@ -0,0 +1,47 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; + +import java.lang.annotation.*; + +/** + * Specifies a method for downsampling the reads passed to a given + * walker based on the input from that walker. + * + * @author hanna + * @version 0.1 + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface Downsample { + DownsampleType by(); + int toCoverage() default -1; + double toFraction() default -1.0F; +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/DuplicateWalker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/DuplicateWalker.java new file mode 100644 index 000000000..42398ec33 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/DuplicateWalker.java @@ -0,0 +1,57 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.engine.filters.NotPrimaryAlignmentFilter; +import org.broadinstitute.gatk.engine.filters.UnmappedReadFilter; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.List; +import java.util.Set; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 2:52:28 PM + * To change this template use File | Settings | File Templates. + */ +@Requires({DataSource.READS,DataSource.REFERENCE}) +@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class}) +public abstract class DuplicateWalker extends Walker { + // Do we actually want to operate on the context? + public boolean filter(GenomeLoc loc, AlignmentContext context, Set> readSets ) { + return true; // We are keeping all the reads + } + + public abstract MapType map(GenomeLoc loc, AlignmentContext context, Set> readSets ); + + // Given result of map function + public abstract ReduceType reduceInit(); + public abstract ReduceType reduce(MapType value, ReduceType sum); +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/FailMethod.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/FailMethod.java new file mode 100644 index 000000000..3f8862975 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/FailMethod.java @@ -0,0 +1,63 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import htsjdk.samtools.SAMException; +import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +public enum FailMethod { + MAP, + REDUCE, + TREE_REDUCE; + + /** + * Used by QC walkers to test that engine throws appropriate errors. + * Split from the walker in ErrorThrowing.java. + * @param exceptionToThrow Exception type to throw. + */ + public static void fail(final String exceptionToThrow) { + switch (exceptionToThrow) { + case "UserException": + throw new UserException("UserException"); + case "NullPointerException": + throw new NullPointerException(); + case "ReviewedGATKException": + throw new ReviewedGATKException("ReviewedGATKException"); + case "SamError1": + throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_1); + case "SamError2": + throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_2); + case "NoSpace1": + throw new htsjdk.samtools.util.RuntimeIOException(new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); + case "NoSpace2": + throw new SAMException("Exception writing BAM index file", new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); + default: + throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow); + } + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/LocusWalker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/LocusWalker.java new file mode 100644 index 000000000..3c6268de3 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/LocusWalker.java @@ -0,0 +1,58 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.engine.filters.DuplicateReadFilter; +import org.broadinstitute.gatk.engine.filters.FailsVendorQualityCheckFilter; +import org.broadinstitute.gatk.engine.filters.NotPrimaryAlignmentFilter; +import org.broadinstitute.gatk.engine.filters.UnmappedReadFilter; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 2:52:28 PM + * To change this template use File | Settings | File Templates. + */ +@By(DataSource.READS) +@Requires({DataSource.READS,DataSource.REFERENCE}) +@PartitionBy(PartitionType.LOCUS) +@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class}) +@Downsample(by = DownsampleType.BY_SAMPLE, toCoverage = 1000) +@RemoveProgramRecords +public abstract class LocusWalker extends Walker { + // Do we actually want to operate on the context? + public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return true; // We are keeping all the reads + } + + // Map over the org.broadinstitute.gatk.engine.contexts.AlignmentContext + public abstract MapType map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context); +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/NanoSchedulable.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/NanoSchedulable.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/NanoSchedulable.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/NanoSchedulable.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionBy.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionBy.java new file mode 100644 index 000000000..200614e50 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionBy.java @@ -0,0 +1,40 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import java.lang.annotation.*; + +/** + * Allows the walker to indicate how to partition data it wants to consume. + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface PartitionBy { + PartitionType value(); + boolean includeUnmapped() default false; +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionType.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionType.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionType.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionType.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/RMD.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/RMD.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/RMD.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/RMD.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadFilters.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadFilters.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadFilters.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadFilters.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadPairWalker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadPairWalker.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadPairWalker.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadPairWalker.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadWalker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadWalker.java new file mode 100644 index 000000000..8c59bc8eb --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadWalker.java @@ -0,0 +1,55 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 2:52:28 PM + * To change this template use File | Settings | File Templates. + */ +@Requires({DataSource.READS, DataSource.REFERENCE}) +@PartitionBy(value = PartitionType.READ, includeUnmapped = true) +public abstract class ReadWalker extends Walker { + public boolean requiresOrderedReads() { return false; } + + // Do we actually want to operate on the context? + /** Must return true for reads that need to be processed. Reads, for which this method return false will + * be skipped by the engine and never passed to the walker. + */ + public boolean filter(ReferenceContext ref, GATKSAMRecord read) { + // We are keeping all the reads + return true; + } + + // Map over the org.broadinstitute.gatk.engine.contexts.AlignmentContext + public abstract MapType map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker); +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/RefWalker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/RefWalker.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/RefWalker.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/RefWalker.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Reference.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Reference.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Reference.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Reference.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/RemoveProgramRecords.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/RemoveProgramRecords.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/RemoveProgramRecords.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/RemoveProgramRecords.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Requires.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Requires.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Requires.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Requires.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/RodWalker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/RodWalker.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/RodWalker.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/RodWalker.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/TreeReducible.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/TreeReducible.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/TreeReducible.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/TreeReducible.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Walker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Walker.java new file mode 100644 index 000000000..88c4ff388 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Walker.java @@ -0,0 +1,178 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import htsjdk.samtools.SAMSequenceDictionary; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.engine.filters.BadCigarFilter; +import org.broadinstitute.gatk.engine.filters.MalformedReadFilter; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.engine.samples.Sample; +import org.broadinstitute.gatk.engine.samples.SampleDB; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.baq.BAQ; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; +import org.broadinstitute.gatk.engine.recalibration.BQSRMode; + +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: hanna + * Date: Mar 17, 2009 + * Time: 1:53:31 PM + * To change this template use File | Settings | File Templates. + */ +@ReadFilters({MalformedReadFilter.class,BadCigarFilter.class}) +@PartitionBy(PartitionType.NONE) +@Downsample(by = DownsampleType.NONE) +@BAQMode(QualityMode = BAQ.QualityMode.OVERWRITE_QUALS, ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT) +@BQSRMode(ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT) +@DocumentedGATKFeature(groupName = "Uncategorized", extraDocs = {CommandLineGATK.class}) +public abstract class Walker { + final protected static Logger logger = Logger.getLogger(Walker.class); + private GenomeAnalysisEngine toolkit; + + protected Walker() { + } + + /** + * Set the toolkit, for peering into internal structures that can't + * otherwise be read. + * @param toolkit The genome analysis toolkit. + */ + public void setToolkit(GenomeAnalysisEngine toolkit) { + this.toolkit = toolkit; + } + + /** + * Retrieve the toolkit, for peering into internal structures that can't + * otherwise be read. Use sparingly, and discuss uses with software engineering + * team. + * @return The genome analysis toolkit. + */ + protected GenomeAnalysisEngine getToolkit() { + return toolkit; + } + + /** + * Gets the master sequence dictionary for this walker + * @link GenomeAnalysisEngine.getMasterSequenceDictionary + * @return + */ + protected SAMSequenceDictionary getMasterSequenceDictionary() { + return getToolkit().getMasterSequenceDictionary(); + } + + public SampleDB getSampleDB() { + return getToolkit().getSampleDB(); + } + + protected Sample getSample(final String id) { + return getToolkit().getSampleDB().getSample(id); + } + + /** + * (conceptual static) method that states whether you want to see reads piling up at a locus + * that contain a deletion at the locus. + * + * ref: ATCTGA + * read1: ATCTGA + * read2: AT--GA + * + * Normally, the locus iterator only returns a list of read1 at this locus at position 3, but + * if this function returns true, then the system will return (read1, read2) with offsets + * of (3, -1). The -1 offset indicates a deletion in the read. + * + * @return false if you don't want to see deletions, or true if you do + */ + public boolean includeReadsWithDeletionAtLoci() { + return false; + } + + public void initialize() { } + + /** + * A function for overloading in subclasses providing a mechanism to abort early from a walker. + * + * If this ever returns true, then the Traversal engine will stop executing map calls + * and start the process of shutting down the walker in an orderly fashion. + * @return + */ + public boolean isDone() { + return false; + } + + /** + * Provide an initial value for reduce computations. + * @return Initial value of reduce. + */ + public abstract ReduceType reduceInit(); + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + public abstract ReduceType reduce(MapType value, ReduceType sum); + + public void onTraversalDone(ReduceType result) { + logger.info("[REDUCE RESULT] Traversal result is: " + result); + } + + /** + * General interval reduce routine called after all of the traversals are done + * @param results interval reduce results + */ + public void onTraversalDone(List> results) { + for ( Pair result : results ) { + logger.info(String.format("[INTERVAL REDUCE RESULT] at %s ", result.getFirst())); + this.onTraversalDone(result.getSecond()); + } + } + + /** + * Return true if your walker wants to reduce each interval separately. Default is false. + * + * If you set this flag, several things will happen. + * + * The system will invoke reduceInit() once for each interval being processed, starting a fresh reduce + * Reduce will accumulate normally at each map unit in the interval + * However, onTraversalDone(reduce) will be called after each interval is processed. + * The system will call onTraversalDone( GenomeLoc -> reduce ), after all reductions are done, + * which is overloaded here to call onTraversalDone(reduce) for each location + * + * @return true if your walker wants to reduce each interval separately. + */ + public boolean isReduceByInterval() { + return false; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/WalkerName.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/WalkerName.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/WalkerName.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/WalkerName.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Window.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Window.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Window.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Window.java diff --git a/public/gatk-tools-public/src/main/resources/GATK_public.key b/public/gatk-engine/src/main/resources/GATK_public.key similarity index 100% rename from public/gatk-tools-public/src/main/resources/GATK_public.key rename to public/gatk-engine/src/main/resources/GATK_public.key diff --git a/public/gatk-tools-public/src/main/resources/org/broadinstitute/gatk/engine/phonehome/resources/GATK_AWS_access.key b/public/gatk-engine/src/main/resources/org/broadinstitute/gatk/engine/phonehome/resources/GATK_AWS_access.key similarity index 100% rename from public/gatk-tools-public/src/main/resources/org/broadinstitute/gatk/engine/phonehome/resources/GATK_AWS_access.key rename to public/gatk-engine/src/main/resources/org/broadinstitute/gatk/engine/phonehome/resources/GATK_AWS_access.key diff --git a/public/gatk-tools-public/src/main/resources/org/broadinstitute/gatk/engine/phonehome/resources/GATK_AWS_secret.key b/public/gatk-engine/src/main/resources/org/broadinstitute/gatk/engine/phonehome/resources/GATK_AWS_secret.key similarity index 100% rename from public/gatk-tools-public/src/main/resources/org/broadinstitute/gatk/engine/phonehome/resources/GATK_AWS_secret.key rename to public/gatk-engine/src/main/resources/org/broadinstitute/gatk/engine/phonehome/resources/GATK_AWS_secret.key diff --git a/public/gatk-tools-public/src/main/resources/org/broadinstitute/gatk/utils/recalibration/BQSR.R b/public/gatk-engine/src/main/resources/org/broadinstitute/gatk/engine/recalibration/BQSR.R similarity index 100% rename from public/gatk-tools-public/src/main/resources/org/broadinstitute/gatk/utils/recalibration/BQSR.R rename to public/gatk-engine/src/main/resources/org/broadinstitute/gatk/engine/recalibration/BQSR.R diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/CommandLineGATKUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/CommandLineGATKUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/CommandLineGATKUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/CommandLineGATKUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/EngineFeaturesIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/EngineFeaturesIntegrationTest.java new file mode 100644 index 000000000..fb498412a --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/EngineFeaturesIntegrationTest.java @@ -0,0 +1,793 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import htsjdk.samtools.*; +import htsjdk.tribble.readers.LineIterator; +import org.broadinstitute.gatk.engine.walkers.*; +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.engine.filters.MappingQualityUnavailableFilter; +import org.broadinstitute.gatk.engine.filters.DuplicateReadFilter; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFCodec; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import org.broadinstitute.gatk.utils.variant.VCIterable; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.*; +import java.util.*; + +/** + * + */ +public class EngineFeaturesIntegrationTest extends WalkerTest { + private void testBadRODBindingInput(String type, String name, Class c) { + WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintVariantsWalker -L 1:1 --variant:variant," + type + " " + + b37dbSNP132 + " -R " + b37KGReference + " -o %s", + 1, c); + executeTest(name, spec); + } + + @Test() private void testBadRODBindingInputType1() { + testBadRODBindingInput("beagle", "BEAGLE input to VCF expecting walker", UserException.BadArgumentValue.class); + } + + @Test() private void testBadRODBindingInputType3() { + testBadRODBindingInput("bed", "Bed input to VCF expecting walker", UserException.BadArgumentValue.class); + } + + @Test() private void testBadRODBindingInputTypeUnknownType() { + testBadRODBindingInput("bedXXX", "Unknown input to VCF expecting walker", UserException.UnknownTribbleType.class); + } + + private void testMissingFile(String name, String missingBinding) { + WalkerTestSpec spec = new WalkerTestSpec(missingBinding + " -R " + b37KGReference + " -o %s", + 1, UserException.CouldNotReadInputFile.class); + executeTest(name, spec); + } + + @Test() private void testMissingBAMnt1() { + testMissingFile("missing BAM", "-T TestPrintReadsWalker -I missing.bam -nt 1"); + } + @Test() private void testMissingBAMnt4() { + testMissingFile("missing BAM", "-T TestPrintReadsWalker -I missing.bam -nt 4"); + } + @Test() private void testMissingVCF() { + testMissingFile("missing VCF", "-T TestPrintVariantsWalker -V missing.vcf"); + } + @Test() private void testMissingInterval() { + testMissingFile("missing interval", "-T TestPrintReadsWalker -L missing.interval_list -I " + b37GoodBAM); + } + + + // -------------------------------------------------------------------------------- + // + // Test that our exceptions are coming back as we expect + // + // -------------------------------------------------------------------------------- + + private class EngineErrorHandlingTestProvider extends TestDataProvider { + final Class expectedException; + final String args; + final int iterationsToTest; + + public EngineErrorHandlingTestProvider(Class exceptedException, final String args) { + super(EngineErrorHandlingTestProvider.class); + this.expectedException = exceptedException; + this.args = args; + this.iterationsToTest = args.equals("") ? 1 : 10; + setName(String.format("Engine error handling: expected %s with args %s", exceptedException, args)); + } + } + + @DataProvider(name = "EngineErrorHandlingTestProvider") + public Object[][] makeEngineErrorHandlingTestProvider() { + for ( final FailMethod failMethod : FailMethod.values() ) { + if ( failMethod == FailMethod.TREE_REDUCE ) + continue; // cannot reliably throw errors in TREE_REDUCE + + final String failArg = " -fail " + failMethod.name(); + for ( final String args : Arrays.asList("", " -nt 2", " -nct 2") ) { + new EngineErrorHandlingTestProvider(NullPointerException.class, failArg + args); + new EngineErrorHandlingTestProvider(UserException.class, failArg + args); + new EngineErrorHandlingTestProvider(ReviewedGATKException.class, failArg + args); + } + } + + return EngineErrorHandlingTestProvider.getTests(EngineErrorHandlingTestProvider.class); + } + + // + // Loop over errors to throw, make sure they are the errors we get back from the engine, regardless of NT type + // + @Test(enabled = true, dataProvider = "EngineErrorHandlingTestProvider", timeOut = 60 * 1000 ) + public void testEngineErrorHandlingTestProvider(final EngineErrorHandlingTestProvider cfg) { + for ( int i = 0; i < cfg.iterationsToTest; i++ ) { + final String root = "-T TestErrorThrowingWalker -R " + exampleFASTA; + final String args = root + cfg.args + " -E " + cfg.expectedException.getSimpleName(); + WalkerTestSpec spec = new WalkerTestSpec(args, 0, cfg.expectedException); + + executeTest(cfg.toString(), spec); + } + } + + // -------------------------------------------------------------------------------- + // + // Test that read filters are being applied in the order we expect + // + // -------------------------------------------------------------------------------- + + @ReadFilters({MappingQualityUnavailableFilter.class, DuplicateReadFilter.class}) + @DisabledReadFilters({DuplicateReadFilter.class}) + public static class DummyReadWalkerWithFilters extends ReadWalker { + @Output + PrintStream out; + + @Override + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { + return 1; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } + + @Override + public void onTraversalDone(Integer result) { + out.println(result); + } + } + + @Test(enabled = true) + public void testUserReadFilterAppliedBeforeWalker() { + WalkerTestSpec spec = new WalkerTestSpec("-R " + b37KGReference + " -I " + privateTestDir + "allMAPQ255.bam" + + " -T DummyReadWalkerWithFilters -o %s -L MT -rf ReassignMappingQuality", + 1, Arrays.asList("ecf27a776cdfc771defab1c5d19de9ab")); + executeTest("testUserReadFilterAppliedBeforeWalker", spec); + } + + @Test(enabled = true) + public void testUserReadFilterDisabledAppliedBeforeWalker() { + WalkerTestSpec spec = new WalkerTestSpec("-R " + b37KGReference + " -I " + privateTestDir + "allMAPQ255.bam" + + " -T DummyReadWalkerWithFilters -o %s -L MT -drf DuplicateRead", + 1, Arrays.asList("897316929176464ebc9ad085f31e7284")); + executeTest("testUserReadFilterDisabledAppliedBeforeWalker", spec); + } + + @Test( enabled = true, expectedExceptions = RuntimeException.class ) + public void testUserReadFilterDisabledAppliedBeforeWalkerException() { + WalkerTestSpec spec = new WalkerTestSpec("-R " + b37KGReference + " -I " + privateTestDir + "allMAPQ255.bam" + + " -T DummyReadWalkerWithFilters -o %s -L MT -drf ReassignMappingQuality", + 1, Arrays.asList("")); + executeTest("testUserReadFilterDisabledAppliedBeforeWalkerException", spec); + } + + @Test + public void testNegativeCompress() { + testBadCompressArgument(-1); + } + + @Test + public void testTooBigCompress() { + testBadCompressArgument(100); + } + + private void testBadCompressArgument(final int compress) { + WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintReadsWalker -R " + b37KGReference + " -I " + privateTestDir + "NA12878.1_10mb_2_10mb.bam -o %s -compress " + compress, + 1, UserException.class); + executeTest("badCompress " + compress, spec); + } + + // -------------------------------------------------------------------------------- + // + // Test that the VCF version key is what we expect + // + // -------------------------------------------------------------------------------- + @Test(enabled = true) + public void testGATKVersionInVCF() throws Exception { + WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintVariantsWalker -R " + b37KGReference + + " -V " + privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf" + + " -o %s -L 20:61098", + 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File vcf = executeTest("testGATKVersionInVCF", spec).first.get(0); + final VCFCodec codec = new VCFCodec(); + final VCFHeader header = (VCFHeader) codec.readActualHeader(codec.makeSourceFromStream(new FileInputStream(vcf))); + + // go through the metadata headers and look for ones that start with the GATK_COMMAND_LINE_KEY + VCFHeaderLine versionLine = null; + for ( final VCFHeaderLine headerLine : header.getMetaDataInInputOrder()) { + if(headerLine.getKey().startsWith(GATKVCFUtils.GATK_COMMAND_LINE_KEY)) { + versionLine = headerLine; + break; + } + } + Assert.assertNotNull(versionLine); + Assert.assertTrue(versionLine.toString().contains("TestPrintVariantsWalker")); + } + + @Test(enabled = true) + public void testMultipleGATKVersionsInVCF() throws Exception { + WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintVariantsWalker -R " + b37KGReference + + " -V " + privateTestDir + "gatkCommandLineInHeader.vcf" + + " -o %s", + 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File vcf = executeTest("testMultipleGATKVersionsInVCF", spec).first.get(0); + final VCFCodec codec = new VCFCodec(); + final VCFHeader header = (VCFHeader) codec.readActualHeader(codec.makeSourceFromStream(new FileInputStream(vcf))); + + boolean foundHC = false; + boolean foundPV = false; + for ( final VCFHeaderLine line : header.getMetaDataInInputOrder() ) { + if ( line.getKey().startsWith(GATKVCFUtils.GATK_COMMAND_LINE_KEY) ) { + if ( line.toString().contains("HaplotypeCaller") ) { + Assert.assertFalse(foundHC); + foundHC = true; + } + if ( line.toString().contains("TestPrintVariantsWalker") ) { + Assert.assertFalse(foundPV); + foundPV = true; + } + } + } + + Assert.assertTrue(foundHC, "Didn't find HaplotypeCaller command line header field"); + Assert.assertTrue(foundPV, "Didn't find TestPrintVariantsWalker command line header field"); + } + + @Test(enabled = true) + public void testMultipleGATKVersionsSameWalkerInVCF() throws Exception { + WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintVariantsWalker -R " + b37KGReference + + " -V " + privateTestDir + "gatkCommandLineExistsInHeader.vcf" + + " -o %s", + 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File vcf = executeTest("testMultipleGATKVersionsSameWalkerInVCF", spec).first.get(0); + final VCFCodec codec = new VCFCodec(); + final VCFHeader header = (VCFHeader) codec.readActualHeader(codec.makeSourceFromStream(new FileInputStream(vcf))); + + boolean foundFirstWalker = false; + boolean foundSecondWalker = false; + for ( final VCFHeaderLine line : header.getMetaDataInInputOrder() ) { + if ( line.getKey().startsWith(GATKVCFUtils.GATK_COMMAND_LINE_KEY) ) { + // check if we found the second walker command line header field key + if ( line.getKey().contains("TestPrintVariantsWalker.2") ) { + Assert.assertFalse(foundSecondWalker); + foundSecondWalker = true; + } + // otherwise if this is not the second walker command but contains the same + // walker name, then it is the first occurrence. If we somehow got more than + // two occurrences of this walker, the Assert.assertFalse(foundFirstWalker); + // will catch this + else if ( line.getKey().contains("TestPrintVariantsWalker") ) { + Assert.assertFalse(foundFirstWalker); + foundFirstWalker = true; + } + } + } + + Assert.assertTrue(foundFirstWalker, "Didn't find TestPrintVariantsWalker command line header field"); + Assert.assertTrue(foundSecondWalker, "Didn't find (second) TestPrintVariantsWalker command line header field"); + } + + // -------------------------------------------------------------------------------- + // + // Test that defaultBaseQualities actually works + // + // -------------------------------------------------------------------------------- + + public WalkerTestSpec testDefaultBaseQualities(final Integer value, final String md5) { + return new WalkerTestSpec("-T TestPrintReadsWalker -R " + b37KGReference + " -I " + privateTestDir + "/baseQualitiesToFix.bam -o %s" + + (value != null ? " --defaultBaseQualities " + value : ""), + 1, Arrays.asList(md5)); + } + + @Test() + public void testDefaultBaseQualities20() { + executeTest("testDefaultBaseQualities20", testDefaultBaseQualities(20, "7d254a9d0ec59c66ee3e137f56f4c78f")); + } + + @Test() + public void testDefaultBaseQualities30() { + executeTest("testDefaultBaseQualities30", testDefaultBaseQualities(30, "0f50def6cbbbd8ccd4739e2b3998e503")); + } + + @Test(expectedExceptions = Exception.class) + public void testDefaultBaseQualitiesNoneProvided() { + executeTest("testDefaultBaseQualitiesNoneProvided", testDefaultBaseQualities(null, "")); + } + + // -------------------------------------------------------------------------------- + // + // Test engine-level cigar consolidation + // + // -------------------------------------------------------------------------------- + + @Test + public void testGATKEngineConsolidatesCigars() { + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintReadsWalker" + + " -R " + b37KGReference + + " -I " + privateTestDir + "zero_length_cigar_elements.bam" + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the cigar + + final File outputBam = executeTest("testGATKEngineConsolidatesCigars", spec).first.get(0); + final SAMFileReader reader = new SAMFileReader(outputBam); + reader.setValidationStringency(ValidationStringency.SILENT); + + final SAMRecord read = reader.iterator().next(); + reader.close(); + + // Original cigar was 0M3M0M8M. Check that it's been consolidated after running through the GATK engine: + Assert.assertEquals(read.getCigarString(), "11M", "Cigar 0M3M0M8M not consolidated correctly by the engine"); + } + + // -------------------------------------------------------------------------------- + // + // Test on-the-fly sample renaming + // + // -------------------------------------------------------------------------------- + + // On-the-fly sample renaming test case: one single-sample bam with multiple read groups + @Test + public void testOnTheFlySampleRenamingWithSingleBamFile() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam myNewSampleName")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintReadsWalker" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the read groups + + final File outputBam = executeTest("testOnTheFlySampleRenamingWithSingleBamFile", spec).first.get(0); + final SAMFileReader reader = new SAMFileReader(outputBam); + + for ( final SAMReadGroupRecord readGroup : reader.getFileHeader().getReadGroups() ) { + Assert.assertEquals(readGroup.getSample(), "myNewSampleName", String.format("Sample for read group %s not renamed correctly", readGroup.getId())); + } + + reader.close(); + } + + // On-the-fly sample renaming test case: three single-sample bams with multiple read groups per bam + @Test + public void testOnTheFlySampleRenamingWithMultipleBamFiles() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878", + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam newSampleFor12891", + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam newSampleFor12892")); + + final Map readGroupToNewSampleMap = new HashMap<>(); + for ( String inputBamID : Arrays.asList("12878", "12891", "12892") ) { + final File inputBam = new File(privateTestDir + String.format("CEUTrio.HiSeq.WGS.b37.NA%s.HEADERONLY.bam", inputBamID)); + final SAMFileReader inputBamReader = new SAMFileReader(inputBam); + final String newSampleName = String.format("newSampleFor%s", inputBamID); + for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { + readGroupToNewSampleMap.put(readGroup.getId(), newSampleName); + } + inputBamReader.close(); + } + + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintReadsWalker" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the read groups + + final File outputBam = executeTest("testOnTheFlySampleRenamingWithMultipleBamFiles", spec).first.get(0); + final SAMFileReader outputBamReader = new SAMFileReader(outputBam); + + int totalReadGroupsSeen = 0; + for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { + Assert.assertEquals(readGroup.getSample(), readGroupToNewSampleMap.get(readGroup.getId()), + String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId())); + totalReadGroupsSeen++; + } + + Assert.assertEquals(totalReadGroupsSeen, readGroupToNewSampleMap.size(), "Wrong number of read groups encountered in output bam file"); + + outputBamReader.close(); + } + + // On-the-fly sample renaming test case: three single-sample bams with multiple read groups per bam, + // performing renaming in only SOME of the bams + @Test + public void testOnTheFlySampleRenamingWithMultipleBamFilesPartialRename() throws IOException { + // Rename samples for NA12878 and NA12892, but not for NA12891 + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878", + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam newSampleFor12892")); + + final Map readGroupToNewSampleMap = new HashMap<>(); + for ( String inputBamID : Arrays.asList("12878", "12891", "12892") ) { + final File inputBam = new File(privateTestDir + String.format("CEUTrio.HiSeq.WGS.b37.NA%s.HEADERONLY.bam", inputBamID)); + final SAMFileReader inputBamReader = new SAMFileReader(inputBam); + + // Special-case NA12891, which we're not renaming: + final String newSampleName = inputBamID.equals("12891") ? "NA12891" : String.format("newSampleFor%s", inputBamID); + + for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { + readGroupToNewSampleMap.put(readGroup.getId(), newSampleName); + } + inputBamReader.close(); + } + + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintReadsWalker" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the read groups + + final File outputBam = executeTest("testOnTheFlySampleRenamingWithMultipleBamFilesPartialRename", spec).first.get(0); + final SAMFileReader outputBamReader = new SAMFileReader(outputBam); + + int totalReadGroupsSeen = 0; + for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { + Assert.assertEquals(readGroup.getSample(), readGroupToNewSampleMap.get(readGroup.getId()), + String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId())); + totalReadGroupsSeen++; + } + + Assert.assertEquals(totalReadGroupsSeen, readGroupToNewSampleMap.size(), "Wrong number of read groups encountered in output bam file"); + + outputBamReader.close(); + } + + // On-the-fly sample renaming test case: two single-sample bams with read group collisions + @Test + public void testOnTheFlySampleRenamingWithReadGroupCollisions() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878", + privateTestDir + "CEUTrio.HiSeq.WGS.b37.READ_GROUP_COLLISIONS_WITH_NA12878.HEADERONLY.bam newSampleForNot12878")); + + final Set na12878ReadGroups = new HashSet<>(); + final SAMFileReader inputBamReader = new SAMFileReader(new File(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam")); + for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { + na12878ReadGroups.add(readGroup.getId()); + } + inputBamReader.close(); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintReadsWalker" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.READ_GROUP_COLLISIONS_WITH_NA12878.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the read groups + + final File outputBam = executeTest("testOnTheFlySampleRenamingWithReadGroupCollisions", spec).first.get(0); + final SAMFileReader outputBamReader = new SAMFileReader(outputBam); + + int totalReadGroupsSeen = 0; + for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { + String expectedSampleName = ""; + if ( na12878ReadGroups.contains(readGroup.getId()) ) { + expectedSampleName = "newSampleFor12878"; + } + else { + expectedSampleName = "newSampleForNot12878"; + } + + Assert.assertEquals(readGroup.getSample(), expectedSampleName, + String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId())); + totalReadGroupsSeen++; + } + + Assert.assertEquals(totalReadGroupsSeen, na12878ReadGroups.size() * 2, "Wrong number of read groups encountered in output bam file"); + + outputBamReader.close(); + } + + // On-the-fly sample renaming test case: a multi-sample bam (this should generate a UserException) + @Test + public void testOnTheFlySampleRenamingWithMultiSampleBam() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.MERGED.HEADERONLY.bam myNewSampleName")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintReadsWalker" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.MERGED.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, + UserException.class); // expecting a UserException here + + executeTest("testOnTheFlySampleRenamingWithMultiSampleBam", spec); + } + + // On-the-fly sample renaming test case: ensure that walkers can see the remapped sample names in individual reads + @Test + public void testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInReads() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam myNewSampleName")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T OnTheFlySampleRenamingVerifyingTestWalker" + + " -R " + b37KGReference + + " -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " --newSampleName myNewSampleName" + + " -L 20:10000000-10001000", + 1, Arrays.asList("")); + + // Test is a success if our custom walker doesn't throw an exception + executeTest("testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInReads", spec); + } + + @Test + public void testOnTheFlySampleRenamingSingleSampleVCF() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf newSampleForNA12878")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintVariantsWalker" + + " -R " + b37KGReference + + " -V " + privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, + Arrays.asList("")); // No MD5s -- we will inspect the output file manually + + final File outputVCF = executeTest("testOnTheFlySampleRenamingSingleSampleVCF", spec).first.get(0); + verifySampleRenaming(outputVCF, "newSampleForNA12878"); + } + + private void verifySampleRenaming( final File outputVCF, final String newSampleName ) throws IOException { + final Pair> headerAndVCIter = VCIterable.readAllVCs(outputVCF, new VCFCodec()); + final VCFHeader header = headerAndVCIter.getFirst(); + final VCIterable iter = headerAndVCIter.getSecond(); + + // Verify that sample renaming occurred at both the header and record levels (checking only the first 10 records): + + Assert.assertEquals(header.getGenotypeSamples().size(), 1, "Wrong number of samples in output vcf header"); + Assert.assertEquals(header.getGenotypeSamples().get(0), newSampleName, "Wrong sample name in output vcf header"); + + int recordCount = 0; + while ( iter.hasNext() && recordCount < 10 ) { + final VariantContext vcfRecord = iter.next(); + Assert.assertEquals(vcfRecord.getSampleNames().size(), 1, "Wrong number of samples in output vcf record"); + Assert.assertEquals(vcfRecord.getSampleNames().iterator().next(), newSampleName, "Wrong sample name in output vcf record"); + recordCount++; + } + } + + @Test + public void testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInVCFRecords() throws Exception { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "samplerenametest_single_sample_gvcf.vcf FOOSAMPLE")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T OnTheFlySampleRenamingVerifyingRodWalker" + + " -R " + hg19Reference + + " -V " + privateTestDir + "samplerenametest_single_sample_gvcf.vcf" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " --expectedSampleName FOOSAMPLE" + + " -o %s", + 1, + Arrays.asList("")); // No MD5s -- custom walker will throw an exception if there's a problem + + executeTest("testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInVCFRecords", spec); + } + + @Test + public void testOnTheFlySampleRenamingMultiSampleVCF() throws Exception { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "vcf/vcfWithGenotypes.vcf badSample")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintVariantsWalker" + + " -R " + b37KGReference + + " -V " + privateTestDir + "vcf/vcfWithGenotypes.vcf" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, + UserException.class); // expecting a UserException here + + executeTest("testOnTheFlySampleRenamingMultiSampleVCF", spec); + } + + @Test + public void testOnTheFlySampleRenamingSitesOnlyVCF() throws Exception { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "vcf/vcfWithoutGenotypes.vcf badSample")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintVariantsWalker" + + " -R " + b37KGReference + + " -V " + privateTestDir + "vcf/vcfWithoutGenotypes.vcf" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, + UserException.class); // expecting a UserException here + + executeTest("testOnTheFlySampleRenamingSitesOnlyVCF", spec); + } + + private File createTestSampleRenameMapFile( final List contents ) throws IOException { + final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp"); + final PrintWriter writer = new PrintWriter(mapFile); + + for ( final String line : contents ) { + writer.println(line); + } + writer.close(); + + return mapFile; + } + + public static class OnTheFlySampleRenamingVerifyingTestWalker extends ReadWalker { + @Argument(fullName = "newSampleName", shortName = "newSampleName", doc = "", required = true) + String newSampleName = null; + + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { + if ( ! newSampleName.equals(read.getReadGroup().getSample()) ) { + throw new IllegalStateException(String.format("Encountered read with the wrong sample name. Expected %s found %s", + newSampleName, read.getReadGroup().getSample())); + } + + return 1; + } + + public Integer reduceInit() { return 0; } + public Integer reduce(Integer value, Integer sum) { return value + sum; } + } + + public static class OnTheFlySampleRenamingVerifyingRodWalker extends RodWalker { + @Argument(fullName = "expectedSampleName", shortName = "expectedSampleName", doc = "", required = true) + String expectedSampleName = null; + + @Output + PrintStream out; + + @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) + public RodBinding variants; + + public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + if ( tracker == null ) { + return 0; + } + + for ( final VariantContext vc : tracker.getValues(variants, context.getLocation()) ) { + if ( vc.getSampleNames().size() != 1 ) { + throw new IllegalStateException("Encountered a vcf record with num samples != 1"); + } + + final String actualSampleName = vc.getSampleNames().iterator().next(); + if ( ! expectedSampleName.equals(actualSampleName)) { + throw new IllegalStateException(String.format("Encountered vcf record with wrong sample name. Expected %s found %s", + expectedSampleName, actualSampleName)); + } + } + + return 1; + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer counter, Integer sum) { + return counter + sum; + } + } + + // -------------------------------------------------------------------------------- + // + // Test output file-specific options + // + // -------------------------------------------------------------------------------- + + //Returns the output file + private File testBAMFeatures(final String args, final String md5) { + WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintReadsWalker -R " + b37KGReference + + " -I " + privateTestDir + "NA20313.highCoverageRegion.bam" + + " --no_pg_tag -o %s " + args, + 1, Arrays.asList(".bam"), Arrays.asList(md5)); + return executeTest("testBAMFeatures: "+args, spec).first.get(0); + } + + @Test + public void testSAMWriterFeatures() { + testBAMFeatures("-compress 0", "bb4b55b1f80423970bb9384cbf0d8793"); + testBAMFeatures("-compress 9", "b85ee1636d62e1bb8ed65a245c307167"); + testBAMFeatures("-simplifyBAM", "38f9c30a27dfbc085a2ff52a1617d579"); + + //Validate MD5 + final String expectedMD5 = "6627b9ea33293a0083983feb94948c1d"; + final File md5Target = testBAMFeatures("--generate_md5", expectedMD5); + final File md5File = new File(md5Target.getAbsoluteFile() + ".md5"); + md5File.deleteOnExit(); + Assert.assertTrue(md5File.exists(), "MD5 wasn't created"); + try { + String md5 = new BufferedReader(new FileReader(md5File)).readLine(); + Assert.assertEquals(md5, expectedMD5, "Generated MD5 doesn't match expected"); + } catch (IOException e) { + Assert.fail("Can't parse MD5 file", e); + } + + //Validate that index isn't created + final String unindexedBAM = testBAMFeatures("--disable_bam_indexing", expectedMD5).getAbsolutePath(); + Assert.assertTrue(!(new File(unindexedBAM+".bai").exists()) && + !(new File(unindexedBAM.replace(".bam", ".bai")).exists()), + "BAM index was created even though it was disabled"); + } + + @DataProvider(name = "vcfFeaturesData") + public Object[][] getVCFFeaturesData() { + return new Object[][]{ + {"--sites_only", "94bf1f2c0946e933515e4322323a5716"}, + {"--bcf", "03f2d6988f54a332da48803c78f9c4b3"} + }; + } + + @Test(dataProvider = "vcfFeaturesData") + public void testVCFFeatures(final String args, final String md5) { + WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintVariantsWalker -R " + b37KGReference + + " -V " + privateTestDir + "CEUtrioTest.vcf" + + " --no_cmdline_in_header -o %s " + args, + 1, Arrays.asList(md5)); + executeTest("testVCFFeatures: "+args, spec); + } + + @DataProvider(name = "vcfFormatHandlingData") + public Object[][] getVCFFormatHandlingData() { + return new Object[][]{ + {true, "95b6262efbd40b6b72f44f808f3e4c45"}, + {false, "333232e08b8cdd3303309e438c44277f"} + }; + } + + @Test(dataProvider = "vcfFormatHandlingData") + public void testVCFFormatHandling(final boolean writeFullFormat, final String md5) { + WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintVariantsWalker -R " + b37KGReference + + " -V " + privateTestDir + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf" + + " --no_cmdline_in_header -o %s " + + " --fullyDecode " //Without this parameter, the FORMAT fields will be emitted unchanged. Oops + + (writeFullFormat ? "-writeFullFormat" : "") , + 1, Arrays.asList(md5)); + executeTest("testVCFFormatHandling: "+(writeFullFormat ? "Untrimmed" : "Trimmed"), spec); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/GATKVCFUtilsUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/GATKVCFUtilsUnitTest.java new file mode 100644 index 000000000..3881eb719 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/GATKVCFUtilsUnitTest.java @@ -0,0 +1,159 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import htsjdk.tribble.index.DynamicIndexCreator; +import htsjdk.tribble.index.IndexCreator; +import htsjdk.tribble.index.interval.IntervalIndexCreator; +import htsjdk.tribble.index.linear.LinearIndexCreator; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.walkers.RodWalker; +import org.broadinstitute.gatk.engine.walkers.Walker; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.lang.reflect.Method; +import java.util.Arrays; +import java.util.Collections; +import java.util.Set; + +public class GATKVCFUtilsUnitTest extends BaseTest { + public static class VCFHeaderTestWalker extends RodWalker { + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return null; } + public Integer reduceInit() { return 0; } + public Integer reduce(Integer value, Integer sum) { return value + sum; } + } + + public static class VCFHeaderTest2Walker extends VCFHeaderTestWalker {} + + @Test + public void testAddingVCFHeaderInfo() { + final VCFHeader header = new VCFHeader(); + + final Walker walker1 = new VCFHeaderTestWalker(); + final Walker walker2 = new VCFHeaderTest2Walker(); + + final GenomeAnalysisEngine testEngine1 = new GenomeAnalysisEngine(); + testEngine1.setWalker(walker1); + + final GenomeAnalysisEngine testEngine2 = new GenomeAnalysisEngine(); + testEngine2.setWalker(walker2); + + final VCFHeaderLine line1 = GATKVCFUtils.getCommandLineArgumentHeaderLine(header, testEngine1, Collections.EMPTY_LIST); + logger.warn(line1); + Assert.assertNotNull(line1); + // assert the key matches the expected format (GATKVCFUtils.GATK_COMMAND_LINE_KEY).(walker name) + final String expectedLine1Key = String.format("%s.%s", GATKVCFUtils.GATK_COMMAND_LINE_KEY, testEngine1.getWalkerName()); + Assert.assertEquals(line1.getKey(), expectedLine1Key); + + for (final String field : Arrays.asList("Version", "ID", "Date", "CommandLineOptions")) + Assert.assertTrue(line1.toString().contains(field), "Couldn't find field " + field + " in " + line1.getValue()); + Assert.assertTrue(line1.toString().contains("ID=" + testEngine1.getWalkerName())); + + final VCFHeaderLine line2 = GATKVCFUtils.getCommandLineArgumentHeaderLine(header, testEngine2, Collections.EMPTY_LIST); + logger.warn(line2); + + + header.addMetaDataLine(line1); + final Set lines1 = header.getMetaDataInInputOrder(); + Assert.assertTrue(lines1.contains(line1)); + + header.addMetaDataLine(line2); + final Set lines2 = header.getMetaDataInInputOrder(); + Assert.assertTrue(lines2.contains(line1)); + Assert.assertTrue(lines2.contains(line2)); + + // create a new header line using the same engine as used by line 1 + final VCFHeaderLine line3 = GATKVCFUtils.getCommandLineArgumentHeaderLine(header, testEngine1, Collections.EMPTY_LIST); + logger.warn(line3); + + // ensure convention followed by getCommandLineArgumentHeaderLine is to append ".(number of duplicate engine runs)" + // line3 uses the same walker as line1, whereas line2 uses a different walker. line3 is the second occurrence of walker1 + // so a ".2" gets appended afterwards + final String expectedLine3Key = String.format("%s.%s.2", GATKVCFUtils.GATK_COMMAND_LINE_KEY, testEngine1.getWalkerName()); + Assert.assertEquals(line3.getKey(), expectedLine3Key); + + header.addMetaDataLine(line3); + + final Set lines3 = header.getMetaDataInInputOrder(); + Assert.assertTrue(lines3.contains(line1)); + Assert.assertTrue(lines3.contains(line2)); + Assert.assertTrue(lines3.contains(line3)); + } + + private class IndexCreatorTest extends TestDataProvider { + private final GATKVCFIndexType type; + private final int parameter; + private final Class expectedClass; + private final Integer expectedDimension; + private final Method dimensionGetter; + + private IndexCreatorTest(GATKVCFIndexType type, int parameter, Class expectedClass, Integer expectedDimension, + String dimensionGetterName) { + super(IndexCreatorTest.class); + + this.type = type; + this.parameter = parameter; + this.expectedClass = expectedClass; + this.expectedDimension = expectedDimension; + try { + // Conditional matches testGetIndexCreator's if-statement + this.dimensionGetter = this.expectedDimension == null ? null : expectedClass.getDeclaredMethod(dimensionGetterName); + } catch (NoSuchMethodException e) { + throw new RuntimeException(e); + } + } + } + + @DataProvider(name = "indexCreator") + public Object[][] indexCreatorData() { + new IndexCreatorTest(GATKVCFIndexType.DYNAMIC_SEEK, 0, DynamicIndexCreator.class, null, null); + new IndexCreatorTest(GATKVCFIndexType.DYNAMIC_SIZE, 0, DynamicIndexCreator.class, null, null); + new IndexCreatorTest(GATKVCFIndexType.LINEAR, 100, LinearIndexCreator.class, 100, "getBinSize"); + new IndexCreatorTest(GATKVCFIndexType.INTERVAL, 200, IntervalIndexCreator.class, 200, "getFeaturesPerInterval"); + + return IndexCreatorTest.getTests(IndexCreatorTest.class); + } + + @Test(dataProvider = "indexCreator") + public void testGetIndexCreator(IndexCreatorTest spec) throws Exception{ + File dummy = new File(""); + IndexCreator ic = GATKVCFUtils.getIndexCreator(spec.type, spec.parameter, dummy); + Assert.assertEquals(ic.getClass(), spec.expectedClass, "Wrong IndexCreator type"); + if (spec.expectedDimension != null) { + Integer dimension = (int)spec.dimensionGetter.invoke(ic); + Assert.assertEquals(dimension, spec.expectedDimension, "Wrong dimension"); + } + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngineUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngineUnitTest.java new file mode 100644 index 000000000..424083a11 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngineUnitTest.java @@ -0,0 +1,272 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import org.broadinstitute.gatk.engine.walkers.TestCountReadsWalker; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.*; + +/** + * Tests selected functionality in the GenomeAnalysisEngine class + */ +public class GenomeAnalysisEngineUnitTest extends BaseTest { + + @Test(expectedExceptions=UserException.class) + public void testEmptySamFileListHandling() throws Exception { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + testEngine.setWalker(new TestCountReadsWalker()); //generalizable to any walker requiring reads + + //supply command line args so validateSuppliedReads() knows whether reads were passed in + GATKArgumentCollection testArgs = new GATKArgumentCollection(); + testArgs.samFiles.add("empty.list"); + testEngine.setArguments(testArgs); + + //represents the empty list of samFiles read in from empty.list by CommandLineExecutable + Collection samFiles = new ArrayList(); + + testEngine.setSAMFileIDs(samFiles); + testEngine.validateSuppliedReads(); + } + + @Test(expectedExceptions=UserException.class) + public void testDuplicateSamFileHandlingSingleDuplicate() throws Exception { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + + Collection samFiles = new ArrayList(); + samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleBAM.bam"), new Tags())); + + testEngine.setSAMFileIDs(samFiles); + testEngine.checkForDuplicateSamFiles(); + } + + @Test(expectedExceptions=UserException.class) + public void testDuplicateSamFileHandlingMultipleDuplicates() throws Exception { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + + Collection samFiles = new ArrayList(); + samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleNORG.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleNORG.bam"), new Tags())); + + testEngine.setSAMFileIDs(samFiles); + testEngine.checkForDuplicateSamFiles(); + } + + @Test(expectedExceptions=UserException.class) + public void testDuplicateSamFileHandlingAbsoluteVsRelativePath() { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + + final File relativePathToBAMFile = new File(publicTestDir + "exampleBAM.bam"); + final File absolutePathToBAMFile = new File(relativePathToBAMFile.getAbsolutePath()); + Collection samFiles = new ArrayList(); + samFiles.add(new SAMReaderID(relativePathToBAMFile, new Tags())); + samFiles.add(new SAMReaderID(absolutePathToBAMFile, new Tags())); + + testEngine.setSAMFileIDs(samFiles); + testEngine.checkForDuplicateSamFiles(); + } + + @Test + public void testEmptyIntervalSetHandling() throws Exception { + GenomeLocParser genomeLocParser = new GenomeLocParser(ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000).getSequenceDictionary()); + + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + + testEngine.setWalker(new TestCountReadsWalker()); + testEngine.setIntervals(new GenomeLocSortedSet(genomeLocParser)); + + testEngine.validateSuppliedIntervals(); + } + + @Test + public void testLoadWellFormedSampleRenameMapFile() throws IOException { + final File mapFile = createTestSampleRenameMapFile(Arrays.asList("/foo/bar/first.bam newSample1", + "/foo/bar/second.bam newSample2", + "/foo/bar2/third.bam newSample3", + "/foo/bar2/fourth.bam new sample 4", + "/foo/bar2/fifth.bam new sample 5 ")); + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + final Map renameMap = engine.loadSampleRenameMap(mapFile); + + Assert.assertEquals(renameMap.size(), 5, "Sample rename map was wrong size after loading from file"); + + final Iterator expectedResultsIterator = Arrays.asList( + "/foo/bar/first.bam", "newSample1", + "/foo/bar/second.bam", "newSample2", + "/foo/bar2/third.bam", "newSample3", + "/foo/bar2/fourth.bam", "new sample 4", + "/foo/bar2/fifth.bam", "new sample 5" + ).iterator(); + while ( expectedResultsIterator.hasNext() ) { + final String expectedKey = expectedResultsIterator.next(); + final String expectedValue = expectedResultsIterator.next(); + + Assert.assertNotNull(renameMap.get(expectedKey), String.format("Entry for %s not found in sample rename map", expectedKey)); + Assert.assertEquals(renameMap.get(expectedKey), expectedValue, "Wrong value in sample rename map for " + expectedKey); + } + } + + @DataProvider(name = "MalformedSampleRenameMapFileDataProvider") + public Object[][] generateMalformedSampleRenameMapFiles() throws IOException { + final List tests = new ArrayList(); + + tests.add(new Object[]{"testLoadSampleRenameMapFileNonExistentFile", + new File("/foo/bar/nonexistent")}); + tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine", + createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileNonAbsoluteBamPath", + createTestSampleRenameMapFile(Arrays.asList("relative/path/to/foo.bam newSample"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileDuplicateBamPath", + createTestSampleRenameMapFile(Arrays.asList("/path/to/dupe.bam newSample1", + "/path/to/dupe.bam newSample2"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileTabInSampleName", + createTestSampleRenameMapFile(Arrays.asList("/path/to/stuff.bam some wonky\tsample "))}); + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MalformedSampleRenameMapFileDataProvider", expectedExceptions = UserException.class) + public void testLoadMalformedSampleRenameMapFile( final String testName, final File mapFile ) { + logger.info("Executing test " + testName); + + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + final Map renameMap = engine.loadSampleRenameMap(mapFile); + } + + private File createTestSampleRenameMapFile( final List contents ) throws IOException { + final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp"); + final PrintWriter writer = new PrintWriter(mapFile); + + for ( final String line : contents ) { + writer.println(line); + } + writer.close(); + + return mapFile; + } + + /////////////////////////////////////////////////// + // Test the ReadTransformer ordering enforcement // + /////////////////////////////////////////////////// + + public static class TestReadTransformer extends ReadTransformer { + + private OrderingConstraint orderingConstraint = OrderingConstraint.DO_NOT_CARE; + private boolean enabled; + + protected TestReadTransformer(final OrderingConstraint orderingConstraint) { + this.orderingConstraint = orderingConstraint; + enabled = true; + } + + // need this because PackageUtils will pick up this class as a possible ReadTransformer + protected TestReadTransformer() { + enabled = false; + } + + @Override + public OrderingConstraint getOrderingConstraint() { return orderingConstraint; } + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { return ApplicationTime.HANDLED_IN_WALKER; } + + @Override + public boolean enabled() { return enabled; } + + @Override + public GATKSAMRecord apply(final GATKSAMRecord read) { return read; } + + } + + @DataProvider(name = "ReadTransformerData") + public Object[][] makeReadTransformerData() { + List tests = new ArrayList(); + + for ( final ReadTransformer.OrderingConstraint orderingConstraint1 : ReadTransformer.OrderingConstraint.values() ) { + for ( final ReadTransformer.OrderingConstraint orderingConstraint2 : ReadTransformer.OrderingConstraint.values() ) { + for ( final ReadTransformer.OrderingConstraint orderingConstraint3 : ReadTransformer.OrderingConstraint.values() ) { + tests.add(new Object[]{orderingConstraint1, orderingConstraint2, orderingConstraint3}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ReadTransformerData") + public void testReadTransformer(final ReadTransformer.OrderingConstraint oc1, final ReadTransformer.OrderingConstraint oc2, final ReadTransformer.OrderingConstraint oc3) { + + final GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + final List readTransformers = new ArrayList(3); + readTransformers.add(new TestReadTransformer(oc1)); + readTransformers.add(new TestReadTransformer(oc2)); + readTransformers.add(new TestReadTransformer(oc3)); + + final boolean shouldThrowException = numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_FIRST, oc1, oc2, oc3) > 1 || + numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_LAST, oc1, oc2, oc3) > 1; + + try { + testEngine.setReadTransformers(readTransformers); + + Assert.assertFalse(shouldThrowException); + Assert.assertEquals(testEngine.getReadTransformers().size(), 3); + + Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_FIRST); + Assert.assertTrue(testEngine.getReadTransformers().get(2).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_FIRST); + Assert.assertTrue(testEngine.getReadTransformers().get(0).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); + Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); + } catch (UserException.IncompatibleReadFiltersException e) { + Assert.assertTrue(shouldThrowException); + } + } + + private int numWithConstraint(final ReadTransformer.OrderingConstraint target, final ReadTransformer.OrderingConstraint... constraints ) { + int count = 0; + for ( final ReadTransformer.OrderingConstraint constraint : constraints ) { + if ( constraint == target ) + count++; + } + return count; + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/InstantiableWalker.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/InstantiableWalker.java new file mode 100644 index 000000000..4c6e35d0c --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/InstantiableWalker.java @@ -0,0 +1,37 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.commandline.Hidden; + +@Hidden +public class InstantiableWalker extends Walker { + // Public constructor will generate instantiable message + public InstantiableWalker() {} + public Long reduceInit() { return 0L; } + public Long reduce(Integer value, Long accum) { return 0L; } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/MaxRuntimeIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/MaxRuntimeIntegrationTest.java new file mode 100644 index 000000000..2d48487e4 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/MaxRuntimeIntegrationTest.java @@ -0,0 +1,151 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.Output; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.walkers.LocusWalker; +import org.broadinstitute.gatk.utils.SimpleTimer; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** + * + */ +public class MaxRuntimeIntegrationTest extends WalkerTest { + public static class SleepingWalker extends LocusWalker { + @Output PrintStream out; + + @Argument(fullName="sleepTime",shortName="sleepTime",doc="x", required=false) + public int sleepTime = 100; + + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + try {Thread.sleep(sleepTime);} catch (InterruptedException e) {}; + return 1; + } + + @Override public Integer reduceInit() { return 0; } + @Override public Integer reduce(Integer value, Integer sum) { return sum + value; } + + @Override + public void onTraversalDone(Integer result) { + out.println(result); + } + } + + private static final long STARTUP_TIME = TimeUnit.NANOSECONDS.convert(60, TimeUnit.SECONDS); + + private class MaxRuntimeTestProvider extends TestDataProvider { + final long maxRuntime; + final TimeUnit unit; + + public MaxRuntimeTestProvider(final long maxRuntime, final TimeUnit unit) { + super(MaxRuntimeTestProvider.class); + this.maxRuntime = maxRuntime; + this.unit = unit; + setName(String.format("Max runtime test : %d of %s", maxRuntime, unit)); + } + + public long expectedMaxRuntimeNano() { + return TimeUnit.NANOSECONDS.convert(maxRuntime, unit) + STARTUP_TIME; + } + } + + @DataProvider(name = "MaxRuntimeProvider") + public Object[][] makeMaxRuntimeProvider() { + for ( final TimeUnit requestedUnits : Arrays.asList(TimeUnit.NANOSECONDS, TimeUnit.MILLISECONDS, TimeUnit.SECONDS, TimeUnit.MINUTES) ) + new MaxRuntimeTestProvider(requestedUnits.convert(30, TimeUnit.SECONDS), requestedUnits); + + return MaxRuntimeTestProvider.getTests(MaxRuntimeTestProvider.class); + } + + // + // Loop over errors to throw, make sure they are the errors we get back from the engine, regardless of NT type + // + @Test(enabled = true, dataProvider = "MaxRuntimeProvider", timeOut = 120 * 1000) + public void testMaxRuntime(final MaxRuntimeTestProvider cfg) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestPrintReadsWalker -R " + hg18Reference + + " -I " + validationDataLocation + "NA12878.WEx.downsampled20x.bam -o /dev/null" + + " -maxRuntime " + cfg.maxRuntime + " -maxRuntimeUnits " + cfg.unit, 0, + Collections.emptyList()); + final SimpleTimer timer = new SimpleTimer().start(); + executeTest("Max runtime " + cfg, spec); + final long actualRuntimeNano = timer.getElapsedTimeNano(); + + Assert.assertTrue(actualRuntimeNano < cfg.expectedMaxRuntimeNano(), + "Actual runtime " + TimeUnit.SECONDS.convert(actualRuntimeNano, TimeUnit.NANOSECONDS) + + " exceeded max. tolerated runtime " + TimeUnit.SECONDS.convert(cfg.expectedMaxRuntimeNano(), TimeUnit.NANOSECONDS) + + " given requested runtime " + cfg.maxRuntime + " " + cfg.unit); + } + + @DataProvider(name = "SubshardProvider") + public Object[][] makeSubshardProvider() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{10}); + tests.add(new Object[]{100}); + tests.add(new Object[]{500}); + tests.add(new Object[]{1000}); + tests.add(new Object[]{2000}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "SubshardProvider", timeOut = 120 * 1000) + public void testSubshardTimeout(final int sleepTime) throws Exception { + final int maxRuntime = 5000; + + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T SleepingWalker -R " + b37KGReference + + " -I " + privateTestDir + "NA12878.100kb.BQSRv2.example.bam -o %s" + + " -maxRuntime " + maxRuntime + " -maxRuntimeUnits MILLISECONDS -sleepTime " + sleepTime, 1, + Collections.singletonList("")); + final File result = executeTest("Subshard max runtime ", spec).getFirst().get(0); + final int cycle = Integer.valueOf(new BufferedReader(new FileReader(result)).readLine()); + + final int maxCycles = (int)Math.ceil((maxRuntime * 5) / sleepTime); + logger.warn(String.format("Max cycles %d saw %d in file %s with sleepTime %d and maxRuntime %d", maxCycles, cycle, result, sleepTime, maxRuntime)); + Assert.assertTrue(cycle < maxCycles, "Too many cycles seen -- saw " + cycle + " in file " + result + " but max should have been " + maxCycles); + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/ReadMetricsUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/ReadMetricsUnitTest.java new file mode 100644 index 000000000..50c7f8222 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/ReadMetricsUnitTest.java @@ -0,0 +1,372 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.engine.datasources.providers.LocusShardDataProvider; +import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.gatk.engine.datasources.reads.*; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.engine.executive.WindowMaker; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.engine.traversals.*; +import org.broadinstitute.gatk.engine.walkers.*; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; +import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.sam.*; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +public class ReadMetricsUnitTest extends BaseTest { + + @Test + public void testReadsSeenDoNotOverflowInt() { + + final ReadMetrics metrics = new ReadMetrics(); + + final long moreThanMaxInt = ((long)Integer.MAX_VALUE) + 1L; + + for ( long i = 0L; i < moreThanMaxInt; i++ ) { + metrics.incrementNumReadsSeen(); + } + + Assert.assertEquals(metrics.getNumReadsSeen(), moreThanMaxInt); + Assert.assertTrue(metrics.getNumReadsSeen() > (long) Integer.MAX_VALUE); + + logger.warn(String.format("%d %d %d", Integer.MAX_VALUE, moreThanMaxInt, Long.MAX_VALUE)); + } + + + // Test the accuracy of the read metrics + + private File referenceFile; + private IndexedFastaSequenceFile reference; + private SAMSequenceDictionary dictionary; + private SAMFileHeader header; + private GATKSAMReadGroupRecord readGroup; + private GenomeLocParser genomeLocParser; + private File testBAM; + + private static final int numReadsPerContig = 250000; + private static final List contigs = Arrays.asList("1", "2", "3"); + + @BeforeClass + private void init() throws IOException { + referenceFile = new File(b37KGReference); + reference = new CachingIndexedFastaSequenceFile(referenceFile); + dictionary = reference.getSequenceDictionary(); + genomeLocParser = new GenomeLocParser(dictionary); + header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test"); + header.setSequenceDictionary(dictionary); + header.setSortOrder(SAMFileHeader.SortOrder.coordinate); + readGroup = new GATKSAMReadGroupRecord(header.getReadGroup("test")); + + final List reads = new ArrayList<>(); + for ( final String contig : contigs ) { + for ( int i = 1; i <= numReadsPerContig; i++ ) { + reads.add(buildSAMRecord("read" + contig + "_" + i, contig, i)); + } + } + + createBAM(reads); + } + + private void createBAM(final List reads) throws IOException { + testBAM = createTempFile("TraverseActiveRegionsUnitTest", ".bam"); + + SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, testBAM); + for (GATKSAMRecord read : reads ) { + out.addAlignment(read); + } + out.close(); + + new File(testBAM.getAbsolutePath().replace(".bam", ".bai")).deleteOnExit(); + new File(testBAM.getAbsolutePath() + ".bai").deleteOnExit(); + } + + // copied from LocusViewTemplate + protected GATKSAMRecord buildSAMRecord(final String readName, final String contig, final int alignmentStart) { + GATKSAMRecord record = new GATKSAMRecord(header); + + record.setReadName(readName); + record.setReferenceIndex(dictionary.getSequenceIndex(contig)); + record.setAlignmentStart(alignmentStart); + + record.setCigarString("1M"); + record.setReadString("A"); + record.setBaseQualityString("A"); + record.setReadGroup(readGroup); + + return record; + } + + @Test + public void testCountsFromReadTraversal() { + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + engine.setGenomeLocParser(genomeLocParser); + + final Collection samFiles = new ArrayList<>(); + final SAMReaderID readerID = new SAMReaderID(testBAM, new Tags()); + samFiles.add(readerID); + + final SAMDataSource dataSource = new SAMDataSource(referenceFile, samFiles, new ThreadAllocation(), null, genomeLocParser, + false, + ValidationStringency.STRICT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + new ArrayList(), + false, (byte)30, false, true, null, IntervalMergingRule.ALL); + + engine.setReadsDataSource(dataSource); + + final TraverseReadsNano traverseReadsNano = new TraverseReadsNano(1); + final DummyReadWalker walker = new DummyReadWalker(); + traverseReadsNano.initialize(engine, walker, null); + + for ( final Shard shard : dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()) ) { + final ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard, engine.getGenomeLocParser(), dataSource.seek(shard), reference, new ArrayList()); + traverseReadsNano.traverse(walker, dataProvider, 0); + dataProvider.close(); + } + + Assert.assertEquals(engine.getCumulativeMetrics().getNumReadsSeen(), contigs.size() * numReadsPerContig); + Assert.assertEquals(engine.getCumulativeMetrics().getNumIterations(), contigs.size() * numReadsPerContig); + } + + @Test + public void testCountsFromLocusTraversal() { + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + engine.setGenomeLocParser(genomeLocParser); + + final Collection samFiles = new ArrayList<>(); + final SAMReaderID readerID = new SAMReaderID(testBAM, new Tags()); + samFiles.add(readerID); + + final SAMDataSource dataSource = new SAMDataSource(referenceFile, samFiles, new ThreadAllocation(), null, genomeLocParser, + false, + ValidationStringency.STRICT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + new ArrayList(), + false, (byte)30, false, true, null, IntervalMergingRule.ALL); + + engine.setReadsDataSource(dataSource); + final Set samples = ReadUtils.getSAMFileSamples(dataSource.getHeader()); + + final TraverseLociNano traverseLociNano = new TraverseLociNano(1); + final DummyLocusWalker walker = new DummyLocusWalker(); + traverseLociNano.initialize(engine, walker, null); + + for ( final Shard shard : dataSource.createShardIteratorOverAllReads(new LocusShardBalancer()) ) { + final WindowMaker windowMaker = new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples); + for ( WindowMaker.WindowMakerIterator window : windowMaker ) { + final LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList()); + traverseLociNano.traverse(walker, dataProvider, 0); + dataProvider.close(); + } + windowMaker.close(); + } + + //dataSource.close(); + Assert.assertEquals(engine.getCumulativeMetrics().getNumReadsSeen(), contigs.size() * numReadsPerContig); + Assert.assertEquals(engine.getCumulativeMetrics().getNumIterations(), contigs.size() * numReadsPerContig); + } + + @Test + public void testCountsFromActiveRegionTraversal() { + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + engine.setGenomeLocParser(genomeLocParser); + + final Collection samFiles = new ArrayList<>(); + final SAMReaderID readerID = new SAMReaderID(testBAM, new Tags()); + samFiles.add(readerID); + + final SAMDataSource dataSource = new SAMDataSource(referenceFile, samFiles, new ThreadAllocation(), null, genomeLocParser, + false, + ValidationStringency.STRICT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + new ArrayList(), + false, (byte)30, false, true, null, IntervalMergingRule.ALL); + + engine.setReadsDataSource(dataSource); + final Set samples = ReadUtils.getSAMFileSamples(dataSource.getHeader()); + + final List intervals = new ArrayList<>(contigs.size()); + for ( final String contig : contigs ) + intervals.add(genomeLocParser.createGenomeLoc(contig, 1, numReadsPerContig)); + + final TraverseActiveRegions traverseActiveRegions = new TraverseActiveRegions(); + final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + traverseActiveRegions.initialize(engine, walker, null); + + for ( final Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new ActiveRegionShardBalancer()) ) { + final WindowMaker windowMaker = new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples); + for ( WindowMaker.WindowMakerIterator window : windowMaker ) { + final LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList()); + traverseActiveRegions.traverse(walker, dataProvider, 0); + dataProvider.close(); + } + windowMaker.close(); + } + + Assert.assertEquals(engine.getCumulativeMetrics().getNumReadsSeen(), contigs.size() * numReadsPerContig); + Assert.assertEquals(engine.getCumulativeMetrics().getNumIterations(), contigs.size() * numReadsPerContig); + } + + @Test + public void testFilteredCounts() { + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + engine.setGenomeLocParser(genomeLocParser); + + final Collection samFiles = new ArrayList<>(); + final SAMReaderID readerID = new SAMReaderID(testBAM, new Tags()); + samFiles.add(readerID); + + final List filters = new ArrayList<>(); + filters.add(new EveryTenthReadFilter()); + + final SAMDataSource dataSource = new SAMDataSource(referenceFile, samFiles, new ThreadAllocation(), null, genomeLocParser, + false, + ValidationStringency.STRICT, + null, + null, + new ValidationExclusion(), + filters, + new ArrayList(), + false, (byte)30, false, true, null, IntervalMergingRule.ALL); + + engine.setReadsDataSource(dataSource); + + final TraverseReadsNano traverseReadsNano = new TraverseReadsNano(1); + final DummyReadWalker walker = new DummyReadWalker(); + traverseReadsNano.initialize(engine, walker, null); + + for ( final Shard shard : dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()) ) { + final ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard, engine.getGenomeLocParser(), dataSource.seek(shard), reference, new ArrayList()); + traverseReadsNano.traverse(walker, dataProvider, 0); + dataProvider.close(); + } + + Assert.assertEquals((long)engine.getCumulativeMetrics().getCountsByFilter().get(EveryTenthReadFilter.class.getSimpleName()), contigs.size() * numReadsPerContig / 10); + } + + class DummyLocusWalker extends LocusWalker { + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } + + class DummyReadWalker extends ReadWalker { + @Override + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } + + class DummyActiveRegionWalker extends ActiveRegionWalker { + @Override + public ActivityProfileState isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return new ActivityProfileState(ref.getLocus(), 0.0); + } + + @Override + public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } + + private final class EveryTenthReadFilter extends ReadFilter { + + private int myCounter = 0; + + @Override + public boolean filterOut(final SAMRecord record) { + if ( ++myCounter == 10 ) { + myCounter = 0; + return true; + } + + return false; + } + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/SampleUtilsUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/SampleUtilsUnitTest.java new file mode 100644 index 000000000..7de5f0dbf --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/SampleUtilsUnitTest.java @@ -0,0 +1,49 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import org.broadinstitute.gatk.engine.SampleUtils; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +/** + * Testing framework for sample utilities class. + * + * @author gauthier + */ + +public class SampleUtilsUnitTest extends BaseTest { + @Test(expectedExceptions=UserException.class) + public void testBadSampleFiles() throws Exception { + Set sampleFiles = new HashSet(0); + sampleFiles.add(new File("fileNotHere.samples")); + Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/UninstantiableWalker.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/UninstantiableWalker.java new file mode 100644 index 000000000..11a3c3d6d --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/UninstantiableWalker.java @@ -0,0 +1,37 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.commandline.Hidden; + +@Hidden +public class UninstantiableWalker extends Walker { + // Private constructor will generate uninstantiable message + private UninstantiableWalker() {} + public Long reduceInit() { return 0L; } + public Long reduce(Integer value, Long accum) { return 0L; } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/WalkerManagerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/WalkerManagerUnitTest.java new file mode 100644 index 000000000..0a940ef22 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/WalkerManagerUnitTest.java @@ -0,0 +1,61 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.exceptions.DynamicClassResolutionException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +/** + * Tests basic functionality of the walker manager. + */ +public class WalkerManagerUnitTest { + private static WalkerManager walkerManager; + + @BeforeClass + public void setUp() { + walkerManager = new WalkerManager(); + } + + @Test + public void testPresentWalker() { + Walker instantiableWalker = walkerManager.createByName("InstantiableWalker"); + Assert.assertEquals(InstantiableWalker.class, instantiableWalker.getClass()); + } + + @Test(expectedExceptions=UserException.class) + public void testAbsentWalker() { + walkerManager.createByName("Missing"); + } + + @Test(expectedExceptions=DynamicClassResolutionException.class) + public void testUninstantiableWalker() { + walkerManager.createByName("UninstantiableWalker"); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/CramIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/CramIntegrationTest.java new file mode 100644 index 000000000..770d951b0 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/CramIntegrationTest.java @@ -0,0 +1,74 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.arguments; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Test the GATK core CRAM parsing mechanism. + */ +public class CramIntegrationTest extends WalkerTest { + @DataProvider(name="cramData") + public Object[][] getCRAMData() { + return new Object[][] { + {"PrintReads", "exampleBAM.bam", "", "cram", "026ebc00c2a8f9832e37f1a6a0f53521"}, + //{"PrintReads", "exampleCRAM.cram", "", "cram", "026ebc00c2a8f9832e37f1a6a0f53521"}, https://github.com/samtools/htsjdk/issues/148 + {"PrintReads", "exampleCRAM.cram", "", "bam", "99e5f740b43594a5b8e5bc1a007719e0"}, + {"PrintReads", "exampleCRAM-noindex.cram", "", "bam", "99e5f740b43594a5b8e5bc1a007719e0"}, + {"PrintReads", "exampleCRAM.cram", " -L chr1:200", "bam", "072435e8272411c31b2234f851706384"}, + {"PrintReads", "exampleCRAM-noindex.cram", " -L chr1:200", "bam", "072435e8272411c31b2234f851706384"}, + {"CountLoci", "exampleCRAM.cram", "", "txt", "ade93df31a6150321c1067e749cae9be"}, + {"CountLoci", "exampleCRAM-noindex.cram", "", "txt", "ade93df31a6150321c1067e749cae9be"}, + {"CountLoci", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"}, + {"CountLoci", "exampleCRAM-noindex.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"}, + {"CountReads", "exampleCRAM.cram", "", "txt", "4fbafd6948b6529caa2b78e476359875"}, + {"CountReads", "exampleCRAM-noindex.cram", "", "txt", "4fbafd6948b6529caa2b78e476359875"}, + {"CountReads", "exampleCRAM.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"}, + {"CountReads", "exampleCRAM-noindex.cram", " -L chr1:200", "txt", "b026324c6904b2a9cb4b88d6d61c81d1"}, + {"PrintReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "bam", "9598062587ad8d2ec596a8ecb19be979"}, + {"CountLoci", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "26ab0db90d72e28ad0ba1e22ee510510"}, + {"CountReads", "exampleCRAM.cram", " -L chr1:200 -L chr1:89597", "txt", "6d7fce9fee471194aa8b5b6e47267f03"}, + }; + } + + @Test(dataProvider = "cramData") + public void testCRAM(String walker, String input, String args, String ext, String md5) { + WalkerTestSpec spec = new WalkerTestSpec( + " -T Test" + walker + "Walker" + + " -I " + publicTestDir + input + + " -R " + exampleFASTA + + args + + " -o %s", + 1, // just one output file + Arrays.asList(ext), + Arrays.asList(md5)); + executeTest(String.format("testCRAM %s %s -> %s: %s", walker, input, ext, args), spec); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/IntervalIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/IntervalIntegrationTest.java new file mode 100644 index 000000000..1229ecfff --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/IntervalIntegrationTest.java @@ -0,0 +1,304 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.arguments; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; + +/** + * Test the GATK core interval parsing mechanism. + */ +public class IntervalIntegrationTest extends WalkerTest { + @Test(enabled = true) + public void testAllImplicitIntervalParsing() { + String md5 = "7821db9e14d4f8e07029ff1959cd5a99"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testAllIntervalsImplicit",spec); + } + +// '-L all' is no longer supported +// @Test(enabled = true) +// public void testAllExplicitIntervalParsing() { +// String md5 = "7821db9e14d4f8e07029ff1959cd5a99"; +// WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( +// "-T TestCountLociWalker" + +// " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + +// " -R " + hg18Reference + +// " -L all" + +// " -o %s", +// 1, // just one output file +// Arrays.asList(md5)); +// executeTest("testAllIntervalsExplicit",spec); +// } + + @Test + public void testUnmappedReadInclusion() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestPrintReadsWalker" + + " -I " + validationDataLocation + "MV1994.bam" + + " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + + " -L unmapped" + + " -U", + 0, // two output files + Collections.emptyList()); + + // our base file + File baseOutputFile = createTempFile("testUnmappedReadInclusion",".bam"); + spec.setOutputFileLocation(baseOutputFile); + spec.addAuxFile("95e98192e5b90cf80eaa87a4ace263da",createTempFileFromBase(baseOutputFile.getAbsolutePath())); + spec.addAuxFile("fadcdf88597b9609c5f2a17f4c6eb455", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); + + executeTest("testUnmappedReadInclusion",spec); + } + + @Test + public void testMixedMappedAndUnmapped() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestPrintReadsWalker" + + " -I " + validationDataLocation + "MV1994.bam" + + " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + + " -L Escherichia_coli_K12:4630000-4639675" + + " -L unmapped" + + " -U", + 0, // two output files + Collections.emptyList()); + + // our base file + File baseOutputFile = createTempFile("testUnmappedReadInclusion",".bam"); + spec.setOutputFileLocation(baseOutputFile); + spec.addAuxFile("3944b5a6bfc06277ed3afb928a20d588",createTempFileFromBase(baseOutputFile.getAbsolutePath())); + spec.addAuxFile("fa90ff91ac0cc689c71a3460a3530b8b", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); + + executeTest("testUnmappedReadInclusion",spec); + } + + + @Test(enabled = false) + public void testUnmappedReadExclusion() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestPrintReadsWalker" + + " -I " + validationDataLocation + "MV1994.bam" + + " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + + " -XL unmapped" + + " -U", + 0, // two output files + Collections.emptyList()); + + // our base file + File baseOutputFile = createTempFile("testUnmappedReadExclusion",".bam"); + spec.setOutputFileLocation(baseOutputFile); + spec.addAuxFile("80887ba488e53dabd9596ff93070ae75",createTempFileFromBase(baseOutputFile.getAbsolutePath())); + spec.addAuxFile("b341d808ecc33217f37c0c0cde2a3e2f", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); + + executeTest("testUnmappedReadExclusion",spec); + } + + @Test(enabled = true) + public void testIntervalParsingFromFile() { + String md5 = "48a24b70a0b376535542b996af517398"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testIntervalParsingFromFile", spec); + } + + @Test(enabled = true) + public void testIntervalMergingFromFiles() { + String md5 = "9ae0ea9e3c9c6e1b9b6252c8395efdc1"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf" + + " -L " + validationDataLocation + "intervalTest.2.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testIntervalMergingFromFiles", spec); + } + + @Test(enabled = true) + public void testIntervalExclusionsFromFiles() { + String md5 = "26ab0db90d72e28ad0ba1e22ee510510"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf" + + " -XL " + validationDataLocation + "intervalTest.2.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testIntervalExclusionsFromFiles", spec); + } + + @Test(enabled = true) + public void testMixedIntervalMerging() { + String md5 = "7c5aba41f53293b712fd86d08ed5b36e"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf" + + " -L chr1:1677524-1677528", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testMixedIntervalMerging", spec); + } + + @Test(enabled = true) + public void testBed() { + String md5 = "cf4278314ef8e4b996e1b798d8eb92cf"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.bed", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testBed", spec); + } + + @Test(enabled = true) + public void testComplexVCF() { + String md5 = "166d77ac1b46a1ec38aa35ab7e628ab5"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.3.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testComplexVCF", spec); + } + + @Test(enabled = true) + public void testComplexVCFWithPadding() { + String md5 = "649ee93d50739c656e94ec88a32c7ffe"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " --interval_padding 2" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.3.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testComplexVCFWithPadding", spec); + } + + @Test(enabled = true) + public void testMergingWithComplexVCF() { + String md5 = "6d7fce9fee471194aa8b5b6e47267f03"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf" + + " -XL " + validationDataLocation + "intervalTest.3.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testMergingWithComplexVCF", spec); + } + + @Test(enabled = true) + public void testEmptyVCF() { + String md5 = "897316929176464ebc9ad085f31e7284"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.empty.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testEmptyVCFWarning", spec); + } + + @Test(enabled = true) + public void testIncludeExcludeIsTheSame() { + String md5 = "897316929176464ebc9ad085f31e7284"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf" + + " -XL " + validationDataLocation + "intervalTest.1.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testIncludeExcludeIsTheSame", spec); + } + + @Test(enabled = true) + public void testSymbolicAlleles() { + String md5 = "52745056d2fd5904857bbd4984c08098"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam" + + " -R " + b36KGReference + + " -o %s" + + " -L " + privateTestDir + "symbolic_alleles_1.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testSymbolicAlleles", spec); + } + + @Test + public void testIntersectionOfLexicographicallySortedIntervals() { + final String md5 = "18be9375e5a753f766616a51eb6131f0"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + " -T TestCountLociWalker" + + " -I " + privateTestDir + "NA12878.4.snippet.bam" + + " -R " + b37KGReference + + " -L " + privateTestDir + "lexicographicallySortedIntervals.bed" + + " -L 4" + + " -isr INTERSECTION" + + " -o %s", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testIntersectionOfLexicographicallySortedIntervals", spec); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/InvalidArgumentIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/InvalidArgumentIntegrationTest.java new file mode 100644 index 000000000..ca9682747 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/InvalidArgumentIntegrationTest.java @@ -0,0 +1,55 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.arguments; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.annotations.Test; + +public class InvalidArgumentIntegrationTest extends WalkerTest { + @Test + public void testUnknownReadFilter() { + executeTest("UnknownReadFilter", + new WalkerTest.WalkerTestSpec( + " -T TestPrintReadsWalker" + + " -R " + exampleFASTA + + " -I " + publicTestDir + "exampleBAM.bam" + + " -o %s" + + " -rf TestUnknownReadFilter", + 1, UserException.MalformedReadFilterException.class)); + } + + @Test + public void testMalformedWalkerArgs() { + executeTest("MalformedWalkerArgs", + new WalkerTest.WalkerTestSpec( + " -T UnknownWalkerName" + + " -R " + exampleFASTA + + " -I " + publicTestDir + "exampleBAM.bam" + + " -o %s", + 1, UserException.MalformedWalkerArgumentsException.class)); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/LoggingIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/LoggingIntegrationTest.java new file mode 100644 index 000000000..73c177688 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/LoggingIntegrationTest.java @@ -0,0 +1,117 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.arguments; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; + +import org.apache.commons.lang.StringUtils; +import org.apache.log4j.Level; + +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.MD5DB; +import org.broadinstitute.gatk.utils.MD5Mismatch; +import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.utils.runtime.*; + +public class LoggingIntegrationTest { + private final MD5DB md5db = new MD5DB(); + + private class LoggingTestProvider extends BaseTest.TestDataProvider { + + private final String baseCmdLine; + + private final Level logLevel; + private final String logFileStr; + public final File argumentOutputFile; + public final File pipedOutputFile; + + private LoggingTestProvider(final Level logLevel, final boolean explicitLogfile) throws IOException { + super(LoggingTestProvider.class); + + // TODO: a better command line that exercises log levels besides INFO + this.baseCmdLine = String.format("java -cp %s %s -T TestPrintVariantsWalker -R %s -V %s -L 1:1000000-2000000 --no_cmdline_in_header", + StringUtils.join(RuntimeUtils.getAbsoluteClassPaths(), File.pathSeparatorChar), + CommandLineGATK.class.getCanonicalName(), BaseTest.b37KGReference, BaseTest.b37_NA12878_OMNI); + + this.logLevel = logLevel; + this.logFileStr = explicitLogfile ? " -log " + BaseTest.createTempFile(logLevel.toString(), "log") : ""; + this.argumentOutputFile = BaseTest.createTempFile(logLevel.toString(), "vcf"); + this.pipedOutputFile = BaseTest.createTempFile(logLevel.toString(), "vcf"); + } + + public final String getCmdLine(boolean redirectStdout) { + String command = String.format("%s -l %s %s", baseCmdLine, logLevel, logFileStr); + return redirectStdout ? command : command + " -o " + argumentOutputFile; + } + + public String toString() { + return String.format("LoggingTestProvider logLevel=%s", logLevel); + } + } + + @DataProvider(name = "LoggingTest") + public Object[][] makeLoggingTestProvider() throws IOException { + for (Boolean explicitLogFile : Arrays.asList(true, false)) { + // TODO: enable other logging levels when tests for those exist + new LoggingTestProvider(Level.DEBUG, explicitLogFile); + } + + return LoggingTestProvider.getTests(LoggingTestProvider.class); + } + + /** + * test that using an output argument produces the same output as stdout + */ + @Test(dataProvider = "LoggingTest") + public void testStdoutEquivalence(final LoggingTestProvider cfg) throws IOException { + + ProcessController pc = ProcessController.getThreadLocal(); + + // output argument + + ProcessSettings ps = new ProcessSettings(cfg.getCmdLine(false).split("\\s+")); + pc.execAndCheck(ps); + String output_argument_md5 = md5db.calculateFileMD5(cfg.argumentOutputFile); + + // pipe to stdout + + ps = new ProcessSettings(cfg.getCmdLine(true).split("\\s+")); + ps.setStdoutSettings(new OutputStreamSettings(cfg.pipedOutputFile)); + pc.execAndCheck(ps); + + MD5DB.MD5Match result = md5db.testFileMD5("LoggingIntegrationTest", "LoggingIntegrationTest", cfg.pipedOutputFile, output_argument_md5, false); + if(result.failed) { + final MD5Mismatch failure = new MD5Mismatch(result.actualMD5, result.expectedMD5, result.diffEngineOutput); + Assert.fail(failure.toString()); + } + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/CryptUtilsUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/CryptUtilsUnitTest.java new file mode 100644 index 000000000..beac3ace8 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/CryptUtilsUnitTest.java @@ -0,0 +1,200 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.crypt; + +import org.broadinstitute.gatk.engine.crypt.CryptUtils; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.SkipException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.security.Key; +import java.security.KeyPair; +import java.security.PrivateKey; +import java.security.PublicKey; +import java.util.Arrays; + +public class CryptUtilsUnitTest extends BaseTest { + + @Test + public void testGenerateValidKeyPairWithDefaultSettings() { + KeyPair keyPair = CryptUtils.generateKeyPair(); + Assert.assertTrue(CryptUtils.keysDecryptEachOther(keyPair.getPrivate(), keyPair.getPublic())); + } + + @DataProvider( name = "InvalidKeyPairSettings" ) + public Object[][] invalidKeyPairSettingsDataProvider() { + return new Object[][] { + { -1, CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM, CryptUtils.DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM}, + { CryptUtils.DEFAULT_KEY_LENGTH, "Made-up algorithm", CryptUtils.DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM}, + { CryptUtils.DEFAULT_KEY_LENGTH, CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM, "Made-up algorithm"} + }; + } + + @Test( dataProvider = "InvalidKeyPairSettings", expectedExceptions = ReviewedGATKException.class ) + public void testGenerateKeyPairWithInvalidSettings( int keyLength, String encryptionAlgorithm, String randomNumberGenerationAlgorithm ) { + KeyPair keyPair = CryptUtils.generateKeyPair(keyLength, encryptionAlgorithm, randomNumberGenerationAlgorithm); + } + + @Test + public void testGATKMasterKeyPairMutualDecryption() { + if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { + throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", + "testGATKMasterKeyPairMutualDecryption")); + } + + Assert.assertTrue(CryptUtils.keysDecryptEachOther(CryptUtils.loadGATKMasterPrivateKey(), CryptUtils.loadGATKMasterPublicKey())); + } + + @Test + public void testGATKMasterPrivateKeyWithDistributedPublicKeyMutualDecryption() { + if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { + throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", + "testGATKMasterPrivateKeyWithDistributedPublicKeyMutualDecryption")); + } + + Assert.assertTrue(CryptUtils.keysDecryptEachOther(CryptUtils.loadGATKMasterPrivateKey(), CryptUtils.loadGATKDistributedPublicKey())); + } + + @Test + public void testKeyPairWriteThenRead() { + KeyPair keyPair = CryptUtils.generateKeyPair(); + File privateKeyFile = createTempFile("testKeyPairWriteThenRead_private", "key"); + File publicKeyFile = createTempFile("testKeyPairWriteThenRead_public", "key"); + + CryptUtils.writeKeyPair(keyPair, privateKeyFile, publicKeyFile); + + assertKeysAreEqual(keyPair.getPrivate(), CryptUtils.readPrivateKey(privateKeyFile)); + assertKeysAreEqual(keyPair.getPublic(), CryptUtils.readPublicKey(publicKeyFile)); + } + + @Test + public void testPublicKeyWriteThenReadFromFile() { + File keyFile = createTempFile("testPublicKeyWriteThenReadFromFile", "key"); + PublicKey publicKey = CryptUtils.generateKeyPair().getPublic(); + + CryptUtils.writeKey(publicKey, keyFile); + + assertKeysAreEqual(publicKey, CryptUtils.readPublicKey(keyFile)); + } + + @Test + public void testPublicKeyWriteThenReadFromStream() throws IOException { + File keyFile = createTempFile("testPublicKeyWriteThenReadFromStream", "key"); + PublicKey publicKey = CryptUtils.generateKeyPair().getPublic(); + + CryptUtils.writeKey(publicKey, keyFile); + + assertKeysAreEqual(publicKey, CryptUtils.readPublicKey(new FileInputStream(keyFile))); + } + + @Test + public void testPrivateKeyWriteThenReadFromFile() { + File keyFile = createTempFile("testPrivateKeyWriteThenReadFromFile", "key"); + PrivateKey privateKey = CryptUtils.generateKeyPair().getPrivate(); + + CryptUtils.writeKey(privateKey, keyFile); + + assertKeysAreEqual(privateKey, CryptUtils.readPrivateKey(keyFile)); + } + + @Test + public void testPrivateKeyWriteThenReadFromStream() throws IOException { + File keyFile = createTempFile("testPrivateKeyWriteThenReadFromStream", "key"); + PrivateKey privateKey = CryptUtils.generateKeyPair().getPrivate(); + + CryptUtils.writeKey(privateKey, keyFile); + + assertKeysAreEqual(privateKey, CryptUtils.readPrivateKey(new FileInputStream(keyFile))); + } + + @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) + public void testReadNonExistentPublicKey() { + File nonExistentFile = new File("jdshgkdfhg.key"); + Assert.assertFalse(nonExistentFile.exists()); + + CryptUtils.readPublicKey(nonExistentFile); + } + + @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) + public void testReadNonExistentPrivateKey() { + File nonExistentFile = new File("jdshgkdfhg.key"); + Assert.assertFalse(nonExistentFile.exists()); + + CryptUtils.readPrivateKey(nonExistentFile); + } + + @Test + public void testDecodePublicKey() { + PublicKey originalKey = CryptUtils.generateKeyPair().getPublic(); + PublicKey decodedKey = CryptUtils.decodePublicKey(originalKey.getEncoded(), CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM); + assertKeysAreEqual(originalKey, decodedKey); + } + + @Test + public void testDecodePrivateKey() { + PrivateKey originalKey = CryptUtils.generateKeyPair().getPrivate(); + PrivateKey decodedKey = CryptUtils.decodePrivateKey(originalKey.getEncoded(), CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM); + assertKeysAreEqual(originalKey, decodedKey); + } + + @Test + public void testLoadGATKMasterPrivateKey() { + if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { + throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", + "testLoadGATKMasterPrivateKey")); + } + + PrivateKey gatkMasterPrivateKey = CryptUtils.loadGATKMasterPrivateKey(); + } + + @Test + public void testLoadGATKMasterPublicKey() { + PublicKey gatkMasterPublicKey = CryptUtils.loadGATKMasterPublicKey(); + } + + @Test + public void testLoadGATKDistributedPublicKey() { + PublicKey gatkDistributedPublicKey = CryptUtils.loadGATKDistributedPublicKey(); + } + + private void assertKeysAreEqual( Key originalKey, Key keyFromDisk ) { + Assert.assertTrue(Arrays.equals(originalKey.getEncoded(), keyFromDisk.getEncoded())); + Assert.assertEquals(originalKey.getAlgorithm(), keyFromDisk.getAlgorithm()); + Assert.assertEquals(originalKey.getFormat(), keyFromDisk.getFormat()); + } + + private boolean gatkPrivateKeyExistsButReadPermissionDenied() { + File gatkPrivateKey = new File(CryptUtils.GATK_MASTER_PRIVATE_KEY_FILE); + return gatkPrivateKey.exists() && ! gatkPrivateKey.canRead(); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyIntegrationTest.java new file mode 100644 index 000000000..350ba7b75 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyIntegrationTest.java @@ -0,0 +1,157 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.crypt; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class GATKKeyIntegrationTest extends WalkerTest { + + public static final String BASE_COMMAND = String.format("-T TestPrintReadsWalker -R %s -I %s -o %%s", + publicTestDir + "exampleFASTA.fasta", + publicTestDir + "exampleBAM.bam"); + public static final String MD5_UPON_SUCCESSFUL_RUN = "e7b4a5b62f9d4badef1cd07040011b2b"; + + + private void runGATKKeyTest ( String testName, String etArg, String keyArg, Class expectedException, String md5 ) { + String command = BASE_COMMAND + String.format(" %s %s", etArg, keyArg); + + WalkerTestSpec spec = expectedException != null ? + new WalkerTestSpec(command, 1, expectedException) : + new WalkerTestSpec(command, 1, Arrays.asList(md5)); + + spec.disableImplicitArgs(); // Turn off automatic inclusion of -et/-K args by WalkerTest + executeTest(testName, spec); + } + + @Test + public void testValidKeyNoET() { + runGATKKeyTest("testValidKeyNoET", + "-et " + GATKRunReport.PhoneHomeOption.NO_ET, + "-K " + keysDataLocation + "valid.key", + null, + MD5_UPON_SUCCESSFUL_RUN); + } + + @Test + public void testValidKeyETStdout() { + runGATKKeyTest("testValidKeyETStdout", + "-et " + GATKRunReport.PhoneHomeOption.STDOUT, + "-K " + keysDataLocation + "valid.key", + null, + MD5_UPON_SUCCESSFUL_RUN); + } + + @Test + public void testValidKeyETStandard() { + runGATKKeyTest("testValidKeyETStandard", + "", + "-K " + keysDataLocation + "valid.key", + null, + MD5_UPON_SUCCESSFUL_RUN); + } + + @Test + public void testNoKeyNoET() { + runGATKKeyTest("testNoKeyNoET", + "-et " + GATKRunReport.PhoneHomeOption.NO_ET, + "", + UserException.class, + null); + } + + @Test + public void testNoKeyETStdout() { + runGATKKeyTest("testNoKeyETStdout", + "-et " + GATKRunReport.PhoneHomeOption.STDOUT, + "", + UserException.class, + null); + } + + @Test + public void testNoKeyETStandard() { + runGATKKeyTest("testNoKeyETStandard", + "", + "", + null, + MD5_UPON_SUCCESSFUL_RUN); + } + + @Test + public void testRevokedKey() { + runGATKKeyTest("testRevokedKey", + "-et " + GATKRunReport.PhoneHomeOption.NO_ET, + "-K " + keysDataLocation + "revoked.key", + UserException.KeySignatureVerificationException.class, + null); + } + + @DataProvider(name = "CorruptKeyTestData") + public Object[][] corruptKeyDataProvider() { + return new Object[][] { + { "corrupt_empty.key", UserException.UnreadableKeyException.class }, + { "corrupt_single_byte_file.key", UserException.UnreadableKeyException.class }, + { "corrupt_random_contents.key", UserException.UnreadableKeyException.class }, + { "corrupt_single_byte_deletion.key", UserException.UnreadableKeyException.class }, + { "corrupt_single_byte_insertion.key", UserException.UnreadableKeyException.class }, + { "corrupt_single_byte_change.key", UserException.UnreadableKeyException.class }, + { "corrupt_multi_byte_deletion.key", UserException.UnreadableKeyException.class }, + { "corrupt_multi_byte_insertion.key", UserException.UnreadableKeyException.class }, + { "corrupt_multi_byte_change.key", UserException.UnreadableKeyException.class }, + { "corrupt_bad_isize_field.key", UserException.UnreadableKeyException.class }, + { "corrupt_bad_crc.key", UserException.UnreadableKeyException.class }, + { "corrupt_no_email_address.key", UserException.UnreadableKeyException.class }, + { "corrupt_no_sectional_delimiter.key", UserException.UnreadableKeyException.class }, + { "corrupt_no_signature.key", UserException.UnreadableKeyException.class }, + { "corrupt_bad_signature.key", UserException.KeySignatureVerificationException.class }, + { "corrupt_non_gzipped_valid_key.key", UserException.UnreadableKeyException.class } + }; + } + + @Test(dataProvider = "CorruptKeyTestData") + public void testCorruptKey ( String corruptKeyName, Class expectedException ) { + runGATKKeyTest(String.format("testCorruptKey (%s)", corruptKeyName), + "-et " + GATKRunReport.PhoneHomeOption.NO_ET, + "-K " + keysDataLocation + corruptKeyName, + expectedException, + null); + } + + @Test + public void testCorruptButNonRequiredKey() { + runGATKKeyTest("testCorruptButNonRequiredKey", + "", + "-K " + keysDataLocation + "corrupt_random_contents.key", + null, + MD5_UPON_SUCCESSFUL_RUN); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyUnitTest.java new file mode 100644 index 000000000..89ef0b26e --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyUnitTest.java @@ -0,0 +1,129 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.crypt; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.SkipException; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.io.File; +import java.security.KeyPair; +import java.security.PrivateKey; +import java.security.PublicKey; + +public class GATKKeyUnitTest extends BaseTest { + + @Test + public void testCreateGATKKeyUsingMasterKeyPair() { + if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { + throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", + "testCreateGATKKeyUsingMasterKeyPair")); + } + + PrivateKey masterPrivateKey = CryptUtils.loadGATKMasterPrivateKey(); + PublicKey masterPublicKey = CryptUtils.loadGATKMasterPublicKey(); + + // We should be able to create a valid GATKKey using our master key pair: + GATKKey key = new GATKKey(masterPrivateKey, masterPublicKey, "foo@bar.com"); + Assert.assertTrue(key.isValid()); + } + + @Test + public void testCreateGATKKeyUsingMasterPrivateKeyAndDistributedPublicKey() { + if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { + throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", + "testCreateGATKKeyUsingMasterPrivateKeyAndDistributedPublicKey")); + } + + PrivateKey masterPrivateKey = CryptUtils.loadGATKMasterPrivateKey(); + PublicKey distributedPublicKey = CryptUtils.loadGATKDistributedPublicKey(); + + // We should also be able to create a valid GATKKey using our master private + // key and the public key we distribute with the GATK: + GATKKey key = new GATKKey(masterPrivateKey, distributedPublicKey, "foo@bar.com"); + Assert.assertTrue(key.isValid()); + } + + @Test( expectedExceptions = ReviewedGATKException.class ) + public void testKeyPairMismatch() { + KeyPair firstKeyPair = CryptUtils.generateKeyPair(); + KeyPair secondKeyPair = CryptUtils.generateKeyPair(); + + // Attempting to create a GATK Key with private and public keys that aren't part of the + // same key pair should immediately trigger a validation failure: + GATKKey key = new GATKKey(firstKeyPair.getPrivate(), secondKeyPair.getPublic(), "foo@bar.com"); + } + + @Test( expectedExceptions = ReviewedGATKException.class ) + public void testEncryptionAlgorithmMismatch() { + KeyPair keyPair = CryptUtils.generateKeyPair(CryptUtils.DEFAULT_KEY_LENGTH, "DSA", CryptUtils.DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM); + + // Attempting to use a DSA private key to create an RSA signature should throw an error: + GATKKey key = new GATKKey(keyPair.getPrivate(), keyPair.getPublic(), "foo@bar.com", "SHA1withRSA"); + } + + @Test( expectedExceptions = UserException.class ) + public void testInvalidEmailAddress() { + String emailAddressWithNulByte = new String(new byte[] { 0 }); + KeyPair keyPair = CryptUtils.generateKeyPair(); + + // Email addresses cannot contain the NUL byte, since it's used as a sectional delimiter in the key file: + GATKKey key = new GATKKey(keyPair.getPrivate(), keyPair.getPublic(), emailAddressWithNulByte); + } + + @Test + public void testCreateGATKKeyFromValidKeyFile() { + GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), new File(keysDataLocation + "valid.key")); + Assert.assertTrue(key.isValid()); + } + + @Test( expectedExceptions = UserException.UnreadableKeyException.class ) + public void testCreateGATKKeyFromCorruptKeyFile() { + GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), new File(keysDataLocation + "corrupt_random_contents.key")); + } + + @Test + public void testCreateGATKKeyFromRevokedKeyFile() { + GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), new File(keysDataLocation + "revoked.key")); + Assert.assertFalse(key.isValid()); + } + + @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) + public void testCreateGATKKeyFromNonExistentFile() { + File nonExistentFile = new File("ghfdkgsdhg.key"); + Assert.assertFalse(nonExistentFile.exists()); + + GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), nonExistentFile); + } + + private boolean gatkPrivateKeyExistsButReadPermissionDenied() { + File gatkPrivateKey = new File(CryptUtils.GATK_MASTER_PRIVATE_KEY_FILE); + return gatkPrivateKey.exists() && ! gatkPrivateKey.canRead(); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusViewUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusViewUnitTest.java new file mode 100644 index 000000000..99d7559c4 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusViewUnitTest.java @@ -0,0 +1,90 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + + +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; + +import java.util.List; +/** + * User: hanna + * Date: May 12, 2009 + * Time: 2:34:46 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Test the view of all loci. + */ +public class AllLocusViewUnitTest extends LocusViewTemplate { + + @Override + protected LocusView createView(LocusShardDataProvider provider) { + return new AllLocusView(provider); + } + + /** + * Test the reads according to an independently derived context. + * @param view + * @param range + * @param reads + */ + @Override + protected void testReadsInContext( LocusView view, List range, List reads ) { + AllLocusView allLocusView = (AllLocusView)view; + + // TODO: Should skip over loci not in the given range. + GenomeLoc firstLoc = range.get(0); + GenomeLoc lastLoc = range.get(range.size()-1); + GenomeLoc bounds = genomeLocParser.createGenomeLoc(firstLoc.getContig(),firstLoc.getStart(),lastLoc.getStop()); + + for( int i = bounds.getStart(); i <= bounds.getStop(); i++ ) { + GenomeLoc site = genomeLocParser.createGenomeLoc("chr1",i); + AlignmentContext locusContext = allLocusView.next(); + Assert.assertEquals(locusContext.getLocation(), site, "Locus context location is incorrect"); + int expectedReadsAtSite = 0; + + for( GATKSAMRecord read: reads ) { + if(genomeLocParser.createGenomeLoc(read).containsP(locusContext.getLocation())) { + Assert.assertTrue(locusContext.getReads().contains(read),"Target locus context does not contain reads"); + expectedReadsAtSite++; + } + } + + Assert.assertEquals(locusContext.getReads().size(), expectedReadsAtSite, "Found wrong number of reads at site"); + } + + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusViewUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusViewUnitTest.java new file mode 100644 index 000000000..6665b7481 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusViewUnitTest.java @@ -0,0 +1,102 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + + +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; + +import java.util.List; +/** + * User: hanna + * Date: May 12, 2009 + * Time: 2:34:46 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Test the CoveredLocusView. + */ +public class CoveredLocusViewUnitTest extends LocusViewTemplate { + + /** + * Retrieve a covered locus view. + */ + @Override + protected LocusView createView(LocusShardDataProvider provider) { + return new CoveredLocusView(provider); + } + + /** + * Test the reads according to an independently derived context. + * @param view + * @param range + * @param reads + */ + @Override + protected void testReadsInContext( LocusView view, List range, List reads ) { + CoveredLocusView coveredLocusView = (CoveredLocusView)view; + + // TODO: Should skip over loci not in the given range. + GenomeLoc firstLoc = range.get(0); + GenomeLoc lastLoc = range.get(range.size()-1); + GenomeLoc bounds = genomeLocParser.createGenomeLoc(firstLoc.getContig(),firstLoc.getStart(),lastLoc.getStop()); + + for( int i = bounds.getStart(); i <= bounds.getStop(); i++ ) { + GenomeLoc site = genomeLocParser.createGenomeLoc("chr1",i); + + int expectedReadsAtSite = 0; + for( GATKSAMRecord read: reads ) { + if( genomeLocParser.createGenomeLoc(read).containsP(site) ) + expectedReadsAtSite++; + } + + if( expectedReadsAtSite < 1 ) + continue; + + Assert.assertTrue(coveredLocusView.hasNext(),"Incorrect number of loci in view"); + + AlignmentContext locusContext = coveredLocusView.next(); + Assert.assertEquals(locusContext.getLocation(), site, "Target locus context location is incorrect"); + Assert.assertEquals(locusContext.getReads().size(), expectedReadsAtSite, "Found wrong number of reads at site"); + + for( GATKSAMRecord read: reads ) { + if(genomeLocParser.createGenomeLoc(read).containsP(locusContext.getLocation())) + Assert.assertTrue(locusContext.getReads().contains(read),"Target locus context does not contain reads"); + } + } + + Assert.assertFalse(coveredLocusView.hasNext(),"Iterator is not bounded at boundaries of shard"); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedViewUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedViewUnitTest.java new file mode 100644 index 000000000..791046a77 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedViewUnitTest.java @@ -0,0 +1,366 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import htsjdk.samtools.util.PeekableIterator; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.tribble.SimpleFeature; +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.RodBinding; +import org.broadinstitute.gatk.utils.refdata.RODRecordListImpl; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.utils.GATKFeature; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * @author depristo + */ +public class IntervalReferenceOrderedViewUnitTest extends BaseTest { + private static int startingChr = 1; + private static int endingChr = 2; + private static int readCount = 100; + private static int DEFAULT_READ_LENGTH = ArtificialSAMUtils.DEFAULT_READ_LENGTH; + private static String contig; + private static SAMFileHeader header; + + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void beforeClass() { + header = ArtificialSAMUtils.createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + contig = header.getSequence(0).getSequenceName(); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + + initializeTests(); + } + + private class CompareFeatures implements Comparator { + @Override + public int compare(Feature o1, Feature o2) { + return genomeLocParser.createGenomeLoc(o1).compareTo(genomeLocParser.createGenomeLoc(o2)); + } + } + + private class ReadMetaDataTrackerRODStreamTest extends TestDataProvider { + final List allFeatures; + final List intervals; + + public ReadMetaDataTrackerRODStreamTest(final List allFeatures, final GenomeLoc interval) { + this(allFeatures, Collections.singletonList(interval)); + } + + public ReadMetaDataTrackerRODStreamTest(final List allFeatures, final List intervals) { + super(ReadMetaDataTrackerRODStreamTest.class); + this.allFeatures = new ArrayList(allFeatures); + Collections.sort(this.allFeatures, new CompareFeatures()); + this.intervals = new ArrayList(intervals); + Collections.sort(this.intervals); + setName(String.format("%s nFeatures %d intervals %s", getClass().getSimpleName(), allFeatures.size(), + intervals.size() == 1 ? intervals.get(0) : "size " + intervals.size())); + } + + public PeekableIterator getIterator(final String name) { + return new PeekableIterator(new TribbleIteratorFromCollection(name, genomeLocParser, allFeatures)); + } + + public Set getExpectedOverlaps(final GenomeLoc interval) { + final Set overlapping = new HashSet(); + for ( final Feature f : allFeatures ) + if ( genomeLocParser.createGenomeLoc(f).overlapsP(interval) ) + overlapping.add(f); + return overlapping; + } + } + + public void initializeTests() { + final List handPickedFeatures = new ArrayList(); + + handPickedFeatures.add(new SimpleFeature(contig, 1, 1)); + handPickedFeatures.add(new SimpleFeature(contig, 2, 5)); + handPickedFeatures.add(new SimpleFeature(contig, 4, 4)); + handPickedFeatures.add(new SimpleFeature(contig, 6, 6)); + handPickedFeatures.add(new SimpleFeature(contig, 9, 10)); + handPickedFeatures.add(new SimpleFeature(contig, 10, 10)); + handPickedFeatures.add(new SimpleFeature(contig, 10, 11)); + handPickedFeatures.add(new SimpleFeature(contig, 13, 20)); + + createTestsForFeatures(handPickedFeatures); + + // test in the present of a large spanning element + { + List oneLargeSpan = new ArrayList(handPickedFeatures); + oneLargeSpan.add(new SimpleFeature(contig, 1, 30)); + createTestsForFeatures(oneLargeSpan); + } + + // test in the presence of a partially spanning element + { + List partialSpanStart = new ArrayList(handPickedFeatures); + partialSpanStart.add(new SimpleFeature(contig, 1, 6)); + createTestsForFeatures(partialSpanStart); + } + + // test in the presence of a partially spanning element at the end + { + List partialSpanEnd = new ArrayList(handPickedFeatures); + partialSpanEnd.add(new SimpleFeature(contig, 10, 30)); + createTestsForFeatures(partialSpanEnd); + } + + // no data at all + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, 5, 5); + new ReadMetaDataTrackerRODStreamTest(Collections.emptyList(), loc); + } + + // -------------------------------------------------------------------------------- + // + // tests for the lower level IntervalOverlappingRODsFromStream + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "ReadMetaDataTrackerRODStreamTest") + public Object[][] createReadMetaDataTrackerRODStreamTest() { + return ReadMetaDataTrackerRODStreamTest.getTests(ReadMetaDataTrackerRODStreamTest.class); + } + + private GenomeLoc span(final List features) { + int featuresStart = 1; for ( final GenomeLoc f : features ) featuresStart = Math.min(featuresStart, f.getStart()); + int featuresStop = 1; for ( final GenomeLoc f : features ) featuresStop = Math.max(featuresStop, f.getStop()); + return genomeLocParser.createGenomeLoc(contig, featuresStart, featuresStop); + } + + private void createTestsForFeatures(final List features) { + int featuresStart = 1; for ( final Feature f : features ) featuresStart = Math.min(featuresStart, f.getStart()); + int featuresStop = 1; for ( final Feature f : features ) featuresStop = Math.max(featuresStop, f.getEnd()); + + for ( final int size : Arrays.asList(1, 5, 10, 100) ) { + final List allIntervals = new ArrayList(); + // regularly spaced + for ( int start = featuresStart; start < featuresStop; start++) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, start, start + size - 1); + allIntervals.add(loc); + new ReadMetaDataTrackerRODStreamTest(features, loc); + } + + // starting and stopping at every feature + for ( final Feature f : features ) { + // just at the feature + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart(), f.getEnd())); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + + // up to end + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() - 1, f.getEnd())); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + + // missing by 1 + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() + 1, f.getEnd() + 1)); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + + // just spanning + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() - 1, f.getEnd() + 1)); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + } + + new ReadMetaDataTrackerRODStreamTest(features, allIntervals); + } + } + + @Test(enabled = true, dataProvider = "ReadMetaDataTrackerRODStreamTest") + public void runReadMetaDataTrackerRODStreamTest_singleQuery(final ReadMetaDataTrackerRODStreamTest data) { + if ( data.intervals.size() == 1 ) { + final String name = "testName"; + final PeekableIterator iterator = data.getIterator(name); + final IntervalOverlappingRODsFromStream stream = new IntervalOverlappingRODsFromStream(name, iterator); + testRODStream(data, stream, Collections.singletonList(data.intervals.get(0))); + } + } + + @Test(enabled = true, dataProvider = "ReadMetaDataTrackerRODStreamTest", dependsOnMethods = "runReadMetaDataTrackerRODStreamTest_singleQuery") + public void runReadMetaDataTrackerRODStreamTest_multipleQueries(final ReadMetaDataTrackerRODStreamTest data) { + if ( data.intervals.size() > 1 ) { + final String name = "testName"; + final PeekableIterator iterator = data.getIterator(name); + final IntervalOverlappingRODsFromStream stream = new IntervalOverlappingRODsFromStream(name, iterator); + testRODStream(data, stream, data.intervals); + } + } + + private void testRODStream(final ReadMetaDataTrackerRODStreamTest test, final IntervalOverlappingRODsFromStream stream, final List intervals) { + for ( final GenomeLoc interval : intervals ) { + final RODRecordList query = stream.getOverlapping(interval); + final HashSet queryFeatures = new HashSet(); + for ( final GATKFeature f : query ) queryFeatures.add((Feature)f.getUnderlyingObject()); + final Set overlaps = test.getExpectedOverlaps(interval); + + Assert.assertEquals(queryFeatures.size(), overlaps.size(), "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected size = " + overlaps.size() + " but saw " + queryFeatures.size()); + + BaseTest.assertEqualsSet(queryFeatures, overlaps, "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected = " + Utils.join(",", overlaps) + " but saw " + Utils.join(",", queryFeatures)); + } + } + + // -------------------------------------------------------------------------------- + // + // tests for the higher level tracker itself + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "ReadMetaDataTrackerTests") + public Object[][] createTrackerTests() { + List tests = new ArrayList(); + + final Object[][] singleTests = ReadMetaDataTrackerRODStreamTest.getTests(ReadMetaDataTrackerRODStreamTest.class); + final List multiSiteTests = new ArrayList(); + for ( final Object[] singleTest : singleTests ) { + if ( ((ReadMetaDataTrackerRODStreamTest)singleTest[0]).intervals.size() > 1 ) + multiSiteTests.add((ReadMetaDataTrackerRODStreamTest)singleTest[0]); + } + + for ( final boolean testStateless : Arrays.asList(true, false) ) { + // all pairwise tests + for ( List singleTest : Utils.makePermutations(multiSiteTests, 2, false)) { + tests.add(new Object[]{singleTest, testStateless}); + } + + // all 3 way pairwise tests + //for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { + // tests.add(new Object[]{singleTest, testStateless}); + //} + } + + logger.warn("Creating " + tests.size() + " tests for ReadMetaDataTrackerTests"); + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "ReadMetaDataTrackerTests", dependsOnMethods = "runReadMetaDataTrackerRODStreamTest_multipleQueries") + public void runReadMetaDataTrackerTest(final List RODs, final boolean testStateless) { + final List names = new ArrayList(); + final List> iterators = new ArrayList>(); + final List intervals = new ArrayList(); + final List> rodBindings = new ArrayList>(); + + for ( int i = 0; i < RODs.size(); i++ ) { + final RodBinding rodBinding = new RodBinding(Feature.class, "name"+i); + rodBindings.add(rodBinding); + final String name = rodBinding.getName(); + names.add(name); + iterators.add(RODs.get(i).getIterator(name)); + intervals.addAll(RODs.get(i).intervals); + } + + Collections.sort(intervals); + final GenomeLoc span = span(intervals); + final IntervalReferenceOrderedView view = new IntervalReferenceOrderedView(genomeLocParser, span, names, iterators); + + if ( testStateless ) { + // test each tracker is well formed, as each is created + for ( final GenomeLoc interval : intervals ) { + final RefMetaDataTracker tracker = view.getReferenceOrderedDataForInterval(interval); + testMetaDataTrackerBindings(tracker, interval, RODs, rodBindings); + } + } else { + // tests all trackers are correct after reading them into an array + // this checks that the trackers are be safely stored away and analyzed later (critical for nano-scheduling) + final List trackers = new ArrayList(); + for ( final GenomeLoc interval : intervals ) { + final RefMetaDataTracker tracker = view.getReferenceOrderedDataForInterval(interval); + trackers.add(tracker); + } + + for ( int i = 0; i < trackers.size(); i++) { + testMetaDataTrackerBindings(trackers.get(i), intervals.get(i), RODs, rodBindings); + } + } + } + + private void testMetaDataTrackerBindings(final RefMetaDataTracker tracker, + final GenomeLoc interval, + final List RODs, + final List> rodBindings) { + for ( int i = 0; i < RODs.size(); i++ ) { + final ReadMetaDataTrackerRODStreamTest test = RODs.get(i); + final List queryFeaturesList = tracker.getValues(rodBindings.get(i)); + final Set queryFeatures = new HashSet(queryFeaturesList); + final Set overlaps = test.getExpectedOverlaps(interval); + + Assert.assertEquals(queryFeatures.size(), overlaps.size(), "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected size = " + overlaps.size() + " but saw " + queryFeatures.size()); + + BaseTest.assertEqualsSet(queryFeatures, overlaps, "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected = " + Utils.join(",", overlaps) + " but saw " + Utils.join(",", queryFeatures)); + } + } + + static class TribbleIteratorFromCollection implements Iterator { + // current location + private final String name; + final Queue gatkFeatures; + + public TribbleIteratorFromCollection(final String name, final GenomeLocParser genomeLocParser, final List features) { + this.name = name; + + this.gatkFeatures = new LinkedList(); + for ( final Feature f : features ) + gatkFeatures.add(new GATKFeature.TribbleGATKFeature(genomeLocParser, f, name)); + } + + @Override + public boolean hasNext() { + return ! gatkFeatures.isEmpty(); + } + + @Override + public RODRecordList next() { + final GATKFeature first = gatkFeatures.poll(); + final Collection myFeatures = new LinkedList(); + myFeatures.add(first); + while ( gatkFeatures.peek() != null && gatkFeatures.peek().getLocation().getStart() == first.getStart() ) + myFeatures.add(gatkFeatures.poll()); + + GenomeLoc loc = first.getLocation(); + for ( final GATKFeature feature : myFeatures ) + loc = loc.merge(feature.getLocation()); + + return new RODRecordListImpl(name, myFeatures, loc); // is this safe? + } + + @Override public void remove() { throw new IllegalStateException("GRRR"); } + } +} + + diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceViewUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceViewUnitTest.java new file mode 100644 index 000000000..3f620f900 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceViewUnitTest.java @@ -0,0 +1,143 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import org.broadinstitute.gatk.engine.datasources.reads.Shard; +import org.testng.Assert; +import org.testng.annotations.Test; + +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.engine.datasources.reads.MockLocusShard; +import org.broadinstitute.gatk.engine.iterators.GenomeLocusIterator; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; + +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.util.StringUtil; + +import java.util.Collections; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** Tests for viewing the reference from the perspective of a locus. */ + +public class LocusReferenceViewUnitTest extends ReferenceViewTemplate { + +// +// /** Multiple-base pair queries should generate exceptions. */ +// @Test(expectedExceptions=InvalidPositionException.class) +// public void testSingleBPFailure() { +// Shard shard = new LocusShard(GenomeLocParser.createGenomeLoc(0, 1, 50)); +// +// ShardDataProvider dataProvider = new ShardDataProvider(shard, null, sequenceFile, null); +// LocusReferenceView view = new LocusReferenceView(dataProvider); +// +// view.getReferenceContext(shard.getGenomeLoc()).getBase(); +// } + + @Test + public void testOverlappingReferenceBases() { + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc(sequenceFile.getSequenceDictionary().getSequence(0).getSequenceName(), + sequenceFile.getSequence("chrM").length() - 10, + sequenceFile.getSequence("chrM").length()))); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, sequenceFile, null); + LocusReferenceView view = new LocusReferenceView(dataProvider); + + byte[] results = view.getReferenceBases(genomeLocParser.createGenomeLoc(sequenceFile.getSequenceDictionary().getSequence(0).getSequenceName(), + sequenceFile.getSequence("chrM").length() - 10, + sequenceFile.getSequence("chrM").length() + 9)); + System.out.printf("results are %s%n", new String(results)); + Assert.assertEquals(results.length, 20); + for (int x = 0; x < results.length; x++) { + if (x <= 10) Assert.assertTrue(results[x] != 'X'); + else Assert.assertTrue(results[x] == 'X'); + } + } + + + /** Queries outside the bounds of the shard should result in reference context window trimmed at the shard boundary. */ + @Test + public void testBoundsFailure() { + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc(sequenceFile.getSequenceDictionary().getSequence(0).getSequenceName(), 1, 50))); + + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, sequenceFile, null); + LocusReferenceView view = new LocusReferenceView(dataProvider); + + GenomeLoc locus = genomeLocParser.createGenomeLoc(sequenceFile.getSequenceDictionary().getSequence(0).getSequenceName(), 50, 51); + + ReferenceContext rc = view.getReferenceContext(locus); + Assert.assertTrue(rc.getLocus().equals(locus)); + Assert.assertTrue(rc.getWindow().equals(genomeLocParser.createGenomeLoc(sequenceFile.getSequenceDictionary().getSequence(0).getSequenceName(),50))); + Assert.assertTrue(rc.getBases().length == 1); + } + + + /** + * Compares the contents of the fasta and view at a specified location. + * + * @param loc + */ + protected void validateLocation( GenomeLoc loc ) { + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(loc)); + GenomeLocusIterator shardIterator = new GenomeLocusIterator(genomeLocParser,loc); + + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, loc, null, sequenceFile, null); + LocusReferenceView view = new LocusReferenceView(dataProvider); + + while (shardIterator.hasNext()) { + GenomeLoc locus = shardIterator.next(); + + ReferenceSequence expectedAsSeq = sequenceFile.getSubsequenceAt(locus.getContig(), locus.getStart(), locus.getStop()); + char expected = Character.toUpperCase(StringUtil.bytesToString(expectedAsSeq.getBases()).charAt(0)); + char actual = view.getReferenceContext(locus).getBaseAsChar(); + + Assert.assertEquals(actual, expected, String.format("Value of base at position %s in shard %s does not match expected", locus.toString(), shard.getGenomeLocs()) + ); + } + } + +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusViewTemplate.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusViewTemplate.java new file mode 100644 index 000000000..72f2bb1ee --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusViewTemplate.java @@ -0,0 +1,405 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.engine.datasources.reads.MockLocusShard; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.engine.datasources.reads.Shard; +import org.broadinstitute.gatk.engine.executive.WindowMaker; +import org.broadinstitute.gatk.engine.datasources.reads.LocusShard; +import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.*; +/** + * User: hanna + * Date: May 13, 2009 + * Time: 4:29:08 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** Base support for testing variants of the LocusView family of classes. */ + +public abstract class LocusViewTemplate extends BaseTest { + protected static ReferenceSequenceFile sequenceSourceFile = null; + protected GenomeLocParser genomeLocParser = null; + + @BeforeClass + public void setupGenomeLoc() throws FileNotFoundException { + sequenceSourceFile = fakeReferenceSequenceFile(); + genomeLocParser = new GenomeLocParser(sequenceSourceFile); + } + + @Test + public void emptyAlignmentContextTest() { + SAMRecordIterator iterator = new SAMRecordIterator(); + + GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); + Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(null,Collections.emptyList(),new ThreadAllocation(),null,genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, null, null); + + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.emptyList()); + } + + @Test + public void singleReadTest() { + GATKSAMRecord read = buildSAMRecord("read1","chr1", 1, 5); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(shardBounds)); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); + } + + @Test + public void readCoveringFirstPartTest() { + GATKSAMRecord read = buildSAMRecord("read1","chr1", 1, 5); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); + } + + @Test + public void readCoveringLastPartTest() { + GATKSAMRecord read = buildSAMRecord("read1","chr1", 6, 10); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); + } + + @Test + public void readCoveringMiddleTest() { + GATKSAMRecord read = buildSAMRecord("read1","chr1", 3, 7); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); + } + + @Test + public void readAndLocusOverlapAtLastBase() { + GATKSAMRecord read = buildSAMRecord("read1","chr1", 1, 5); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 5, 5))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); + } + + @Test + public void readOverlappingStartTest() { + GATKSAMRecord read = buildSAMRecord("read1","chr1", 1, 10); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 6, 15))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); + } + + @Test + public void readOverlappingEndTest() { + GATKSAMRecord read = buildSAMRecord("read1","chr1", 6, 15); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); + } + + @Test + public void readsSpanningTest() { + GATKSAMRecord read1 = buildSAMRecord("read1","chr1", 1, 5); + GATKSAMRecord read2 = buildSAMRecord("read2","chr1", 6, 10); + SAMRecordIterator iterator = new SAMRecordIterator(read1, read2); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + List expectedReads = new ArrayList(); + Collections.addAll(expectedReads, read1, read2); + testReadsInContext(view, shard.getGenomeLocs(), expectedReads); + } + + @Test + public void duplicateReadsTest() { + GATKSAMRecord read1 = buildSAMRecord("read1","chr1", 1, 5); + GATKSAMRecord read2 = buildSAMRecord("read2","chr1", 1, 5); + GATKSAMRecord read3 = buildSAMRecord("read3","chr1", 6, 10); + GATKSAMRecord read4 = buildSAMRecord("read4","chr1", 6, 10); + SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + List expectedReads = new ArrayList(); + Collections.addAll(expectedReads, read1, read2, read3, read4); + testReadsInContext(view, shard.getGenomeLocs(), expectedReads); + } + + @Test + public void cascadingReadsWithinBoundsTest() { + GATKSAMRecord read1 = buildSAMRecord("read1","chr1", 2, 6); + GATKSAMRecord read2 = buildSAMRecord("read2","chr1", 3, 7); + GATKSAMRecord read3 = buildSAMRecord("read3","chr1", 4, 8); + GATKSAMRecord read4 = buildSAMRecord("read4","chr1", 5, 9); + SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + List expectedReads = new ArrayList(); + Collections.addAll(expectedReads, read1, read2, read3, read4); + testReadsInContext(view, shard.getGenomeLocs(), expectedReads); + } + + @Test + public void cascadingReadsAtBoundsTest() { + GATKSAMRecord read1 = buildSAMRecord("read1","chr1", 1, 5); + GATKSAMRecord read2 = buildSAMRecord("read2","chr1", 2, 6); + GATKSAMRecord read3 = buildSAMRecord("read3","chr1", 3, 7); + GATKSAMRecord read4 = buildSAMRecord("read4","chr1", 4, 8); + GATKSAMRecord read5 = buildSAMRecord("read5","chr1", 5, 9); + GATKSAMRecord read6 = buildSAMRecord("read6","chr1", 6, 10); + SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4, read5, read6); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + List expectedReads = new ArrayList(); + Collections.addAll(expectedReads, read1, read2, read3, read4, read5, read6); + testReadsInContext(view, shard.getGenomeLocs(), expectedReads); + } + + @Test + public void cascadingReadsOverlappingBoundsTest() { + GATKSAMRecord read01 = buildSAMRecord("read1","chr1", 1, 5); + GATKSAMRecord read02 = buildSAMRecord("read2","chr1", 2, 6); + GATKSAMRecord read03 = buildSAMRecord("read3","chr1", 3, 7); + GATKSAMRecord read04 = buildSAMRecord("read4","chr1", 4, 8); + GATKSAMRecord read05 = buildSAMRecord("read5","chr1", 5, 9); + GATKSAMRecord read06 = buildSAMRecord("read6","chr1", 6, 10); + GATKSAMRecord read07 = buildSAMRecord("read7","chr1", 7, 11); + GATKSAMRecord read08 = buildSAMRecord("read8","chr1", 8, 12); + GATKSAMRecord read09 = buildSAMRecord("read9","chr1", 9, 13); + GATKSAMRecord read10 = buildSAMRecord("read10","chr1", 10, 14); + GATKSAMRecord read11 = buildSAMRecord("read11","chr1", 11, 15); + GATKSAMRecord read12 = buildSAMRecord("read12","chr1", 12, 16); + SAMRecordIterator iterator = new SAMRecordIterator(read01, read02, read03, read04, read05, read06, + read07, read08, read09, read10, read11, read12); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 6, 15))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + List expectedReads = new ArrayList(); + Collections.addAll(expectedReads, read01, read02, read03, read04, read05, read06, + read07, read08, read09, read10, read11, read12); + testReadsInContext(view, shard.getGenomeLocs(), expectedReads); + } + + /** + * Creates a view of the type required for testing. + * + * @return The correct view to test. + */ + protected abstract LocusView createView(LocusShardDataProvider provider); + + /** + * Test the reads according to an independently derived context. + * + * @param view + * @param bounds + * @param reads + */ + protected abstract void testReadsInContext(LocusView view, List bounds, List reads); + + /** + * Fake a reference sequence file. Essentially, seek a header with a bunch of dummy data. + * + * @return A 'fake' reference sequence file + */ + private static ReferenceSequenceFile fakeReferenceSequenceFile() { + return new ReferenceSequenceFile() { + public SAMSequenceDictionary getSequenceDictionary() { + SAMSequenceRecord sequenceRecord = new SAMSequenceRecord("chr1", 1000000); + SAMSequenceDictionary dictionary = new SAMSequenceDictionary(Collections.singletonList(sequenceRecord)); + return dictionary; + } + + public boolean isIndexed() { return false; } + + public ReferenceSequence nextSequence() { + throw new UnsupportedOperationException("Fake implementation doesn't support a getter"); + } + + public ReferenceSequence getSequence( String contig ) { + throw new UnsupportedOperationException("Fake implementation doesn't support a getter"); + } + + public ReferenceSequence getSubsequenceAt( String contig, long start, long stop ) { + throw new UnsupportedOperationException("Fake implementation doesn't support a getter"); + } + + public void reset() { + return; + } + + public void close() throws IOException { + } + }; + } + + /** + * Build a SAM record featuring the absolute minimum required dataset. + * + * @param contig Contig to populate. + * @param alignmentStart start of alignment + * @param alignmentEnd end of alignment + * + * @return New SAM Record + */ + protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) { + SAMFileHeader header = new SAMFileHeader(); + header.setSequenceDictionary(sequenceSourceFile.getSequenceDictionary()); + + GATKSAMRecord record = new GATKSAMRecord(header); + + record.setReadName(readName); + record.setReferenceIndex(sequenceSourceFile.getSequenceDictionary().getSequenceIndex(contig)); + record.setAlignmentStart(alignmentStart); + Cigar cigar = new Cigar(); + int len = alignmentEnd - alignmentStart + 1; + cigar.add(new CigarElement(len, CigarOperator.M)); + record.setCigar(cigar); + record.setReadBases(new byte[len]); + record.setBaseQualities(new byte[len]); + return record; + } + + /** A simple iterator which iterates over a list of reads. */ + protected class SAMRecordIterator implements GATKSAMIterator { + private Iterator backingIterator = null; + + public SAMRecordIterator(SAMRecord... reads) { + List backingList = new ArrayList(); + backingList.addAll(Arrays.asList(reads)); + backingIterator = backingList.iterator(); + } + + public boolean hasNext() { + return backingIterator.hasNext(); + } + + public SAMRecord next() { + return backingIterator.next(); + } + + public Iterator iterator() { + return this; + } + + public void close() { + // NO-OP. + } + + public void remove() { + throw new UnsupportedOperationException("Can't remove from a read-only iterator"); + } + } + + +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceViewUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceViewUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceViewUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceViewUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedViewUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedViewUnitTest.java new file mode 100644 index 000000000..dbc2f5518 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -0,0 +1,157 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.commandline.RodBinding; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.engine.datasources.reads.MockLocusShard; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrackBuilder; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; +import org.testng.Assert; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.engine.datasources.reads.Shard; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.codecs.table.TableFeature; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet.RMDStorageType; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; + +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.Arrays; +import java.util.Collections; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +/** + * User: hanna + * Date: May 27, 2009 + * Time: 3:07:23 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Test the transparent view into the reference-ordered data. At the moment, just do some basic bindings and make + * sure the data comes through correctly. + */ +public class ReferenceOrderedViewUnitTest extends BaseTest { + /** + * Sequence file. + */ + private static IndexedFastaSequenceFile seq; + private GenomeLocParser genomeLocParser; + + /** + * our track builder + */ + RMDTrackBuilder builder = null; + + @BeforeClass + public void init() throws FileNotFoundException { + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); + genomeLocParser = new GenomeLocParser(seq); + // disable auto-index creation/locking in the RMDTrackBuilder for tests + builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null,true,null); + } + + /** + * Make sure binding to an empty list produces an empty tracker. + */ + @Test + public void testNoBindings() { + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chrM",1,30))); + LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Collections.emptyList()); + ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); + + RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",10)); + Assert.assertEquals(tracker.getValues(Feature.class).size(), 0, "The tracker should not have produced any data"); + } + + /** + * Test a single ROD binding. + */ + @Test + public void testSingleBinding() { + String fileName = privateTestDir + "TabularDataTest.dat"; + RMDTriplet triplet = new RMDTriplet("tableTest","Table",fileName,RMDStorageType.FILE,new Tags()); + ReferenceOrderedDataSource dataSource = new ReferenceOrderedDataSource(triplet,builder,seq.getSequenceDictionary(),genomeLocParser,false); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chrM",1,30))); + + LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Collections.singletonList(dataSource)); + ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); + + RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20)); + TableFeature datum = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest")); + + Assert.assertEquals(datum.get("COL1"),"C","datum parameter for COL1 is incorrect"); + Assert.assertEquals(datum.get("COL2"),"D","datum parameter for COL2 is incorrect"); + Assert.assertEquals(datum.get("COL3"),"E","datum parameter for COL3 is incorrect"); + } + + /** + * Make sure multiple bindings are visible from the view. + */ + @Test + public void testMultipleBinding() { + File file = new File(privateTestDir + "TabularDataTest.dat"); + + RMDTriplet testTriplet1 = new RMDTriplet("tableTest1","Table",file.getAbsolutePath(),RMDStorageType.FILE,new Tags()); + ReferenceOrderedDataSource dataSource1 = new ReferenceOrderedDataSource(testTriplet1,builder,seq.getSequenceDictionary(),genomeLocParser,false); + + RMDTriplet testTriplet2 = new RMDTriplet("tableTest2","Table",file.getAbsolutePath(),RMDStorageType.FILE,new Tags()); + ReferenceOrderedDataSource dataSource2 = new ReferenceOrderedDataSource(testTriplet2,builder,seq.getSequenceDictionary(),genomeLocParser,false); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chrM",1,30))); + + LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Arrays.asList(dataSource1,dataSource2)); + ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); + + RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20)); + TableFeature datum1 = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest1")); + + Assert.assertEquals(datum1.get("COL1"),"C","datum1 parameter for COL1 is incorrect"); + Assert.assertEquals(datum1.get("COL2"),"D","datum1 parameter for COL2 is incorrect"); + Assert.assertEquals(datum1.get("COL3"),"E","datum1 parameter for COL3 is incorrect"); + + TableFeature datum2 = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest2")); + + Assert.assertEquals(datum2.get("COL1"),"C","datum2 parameter for COL1 is incorrect"); + Assert.assertEquals(datum2.get("COL2"),"D","datum2 parameter for COL2 is incorrect"); + Assert.assertEquals(datum2.get("COL3"),"E","datum2 parameter for COL3 is incorrect"); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceViewTemplate.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceViewTemplate.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceViewTemplate.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceViewTemplate.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProviderUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProviderUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProviderUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProviderUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ActiveRegionShardBalancerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ActiveRegionShardBalancerUnitTest.java new file mode 100644 index 000000000..258e61b49 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ActiveRegionShardBalancerUnitTest.java @@ -0,0 +1,103 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMFileSpan; +import htsjdk.samtools.SAMSequenceRecord; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.FileNotFoundException; +import java.util.*; + +public class ActiveRegionShardBalancerUnitTest extends BaseTest { + // example genome loc parser for this test, can be deleted if you don't use the reference + private GenomeLocParser genomeLocParser; + protected SAMDataSource readsDataSource; + + @BeforeClass + public void setup() throws FileNotFoundException { + // sequence + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(10, 0, 10000); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + readsDataSource = null; + } + + @Test + public void testMergingManyContigs() { + executeTest(genomeLocParser.getContigs().getSequences()); + } + + @Test + public void testMergingAllPointersOnSingleContig() { + executeTest(Arrays.asList(genomeLocParser.getContigs().getSequences().get(1))); + } + + @Test + public void testMergingMultipleDiscontinuousContigs() { + final List all = genomeLocParser.getContigs().getSequences(); + executeTest(Arrays.asList(all.get(1), all.get(3))); + } + + private void executeTest(final Collection records) { + final ActiveRegionShardBalancer balancer = new ActiveRegionShardBalancer(); + + final List> expectedLocs = new LinkedList<>(); + final List pointers = new LinkedList<>(); + + for ( final SAMSequenceRecord record : records ) { + final int size = 10; + int end = 0; + for ( int i = 0; i < record.getSequenceLength(); i += size) { + final int myEnd = i + size - 1; + end = myEnd; + final GenomeLoc loc = genomeLocParser.createGenomeLoc(record.getSequenceName(), i, myEnd); + final Map fileSpans = Collections.emptyMap(); + final FilePointer fp = new FilePointer(fileSpans, IntervalMergingRule.ALL, Collections.singletonList(loc)); + pointers.add(fp); + } + expectedLocs.add(Collections.singleton(genomeLocParser.createGenomeLoc(record.getSequenceName(), 0, end))); + } + + balancer.initialize(readsDataSource, pointers.iterator(), genomeLocParser); + + int i = 0; + int nShardsFound = 0; + for ( final Shard shard : balancer ) { + nShardsFound++; + Assert.assertEquals(new HashSet<>(shard.getGenomeLocs()), expectedLocs.get(i++)); + } + Assert.assertEquals(nShardsFound, records.size(), "Didn't find exactly one shard for each contig in the sequence dictionary"); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/DownsamplerBenchmark.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/DownsamplerBenchmark.java new file mode 100644 index 000000000..7df9bc2cb --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/DownsamplerBenchmark.java @@ -0,0 +1,94 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import com.google.caliper.Param; +import org.broadinstitute.gatk.engine.WalkerManager; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.engine.walkers.LocusWalker; + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Apr 22, 2011 + * Time: 4:02:56 PM + * To change this template use File | Settings | File Templates. + */ +public class DownsamplerBenchmark extends ReadProcessingBenchmark { + @Param + private String bamFile; + + @Param + private Integer maxReads; + + @Override + public String getBAMFile() { return bamFile; } + + @Override + public Integer getMaxReads() { return maxReads; } + + @Param + private Downsampling downsampling; + +// public void timeDownsampling(int reps) { +// for(int i = 0; i < reps; i++) { +// SAMFileReader reader = new SAMFileReader(inputFile); +// ReadProperties readProperties = new ReadProperties(Collections.singletonList(new SAMReaderID(inputFile,new Tags())), +// reader.getFileHeader(), +// SAMFileHeader.SortOrder.coordinate, +// false, +// SAMFileReader.ValidationStringency.SILENT, +// downsampling.create(), +// new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)), +// Collections.emptyList(), +// Collections.emptyList(), +// false, +// (byte)0, +// false); +// +// GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); +// // Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out? +// Iterator readIterator = new FilteringIterator(reader.iterator(),new UnmappedReadFilter()); +// LegacyLocusIteratorByState locusIteratorByState = new LegacyLocusIteratorByState(readIterator,readProperties,genomeLocParser, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups()); +// while(locusIteratorByState.hasNext()) { +// locusIteratorByState.next().getLocation(); +// } +// reader.close(); +// } +// } + + private enum Downsampling { + NONE { + @Override + DownsamplingMethod create() { return DownsamplingMethod.NONE; } + }, + PER_SAMPLE { + @Override + DownsamplingMethod create() { return WalkerManager.getDownsamplingMethod(LocusWalker.class); } + }; + abstract DownsamplingMethod create(); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointerUnitTest.java new file mode 100644 index 000000000..a54237bfb --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointerUnitTest.java @@ -0,0 +1,130 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.GATKBAMFileSpan; +import htsjdk.samtools.GATKChunk; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; + +/** + * + */ +public class FilePointerUnitTest extends BaseTest { + private IndexedFastaSequenceFile seq; + private GenomeLocParser genomeLocParser; + private SAMReaderID readerID = new SAMReaderID("samFile",new Tags()); + + /** + * This function does the setup of our parser, before each method call. + *

    + * Called before every test case method. + */ + @BeforeMethod + public void doForEachTest() throws FileNotFoundException { + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); + genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); + } + + @Test + public void testFilePointerCombineDisjoint() { + FilePointer one = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,5)); + one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1))); + FilePointer two = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",6,10)); + two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,2))); + + FilePointer result = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,10)); + result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2))); + + Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect"); + Assert.assertEquals(two.combine(genomeLocParser,one),result,"Combination of two file pointers is incorrect"); + + //Now test that adjacent (but disjoint) intervals are properly handled with OVERLAPPING_ONLY + one = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",1,5)); + one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1))); + two = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",6,10)); + two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,2))); + + result = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, + genomeLocParser.createGenomeLoc("chr1",1,5), + genomeLocParser.createGenomeLoc("chr1",6,10)); + result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2))); + + Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect"); + Assert.assertEquals(two.combine(genomeLocParser,one),result,"Combination of two file pointers is incorrect"); + } + + @Test + public void testFilePointerCombineJoint() { + FilePointer one = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,5)); + one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2))); + FilePointer two = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",2,6)); + two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,3))); + + FilePointer result = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,6)); + result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,3))); + + Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect"); + Assert.assertEquals(two.combine(genomeLocParser,one),result,"Combination of two file pointers is incorrect"); + + //Repeat the tests for OVERLAPPING_ONLY + one = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",1,5)); + one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2))); + two = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",2,6)); + two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,3))); + + result = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",1,6)); + result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,3))); + + Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect"); + Assert.assertEquals(two.combine(genomeLocParser,one),result,"Combination of two file pointers is incorrect"); + } + + @Test + public void testFilePointerCombineOneSided() { + FilePointer filePointer = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,5)); + filePointer.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1))); + FilePointer empty = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",6,10)); + // Do not add file spans to empty result + + FilePointer result = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,10)); + result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1))); + Assert.assertEquals(filePointer.combine(genomeLocParser,empty),result,"Combination of two file pointers is incorrect"); + Assert.assertEquals(empty.combine(genomeLocParser,filePointer),result,"Combination of two file pointers is incorrect"); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKWalkerBenchmark.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKWalkerBenchmark.java new file mode 100644 index 000000000..aa66d6636 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKWalkerBenchmark.java @@ -0,0 +1,156 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import com.google.caliper.Param; +import org.broadinstitute.gatk.engine.walkers.*; +import org.broadinstitute.gatk.utils.commandline.Output; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.filters.UnmappedReadFilter; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; +import org.broadinstitute.gatk.utils.classloader.JVMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.io.File; +import java.io.PrintStream; +import java.util.Collections; + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Feb 25, 2011 + * Time: 10:16:54 AM + * To change this template use File | Settings | File Templates. + */ +public class GATKWalkerBenchmark extends ReadProcessingBenchmark { + @Param + private String bamFile; + + @Param + private Integer maxReads; + + @Param + private String referenceFile; + + @Param + private WalkerType walkerType; + + @Override + public String getBAMFile() { return bamFile; } + + @Override + public Integer getMaxReads() { return maxReads; } + + @Override + public void setUp() { + super.setUp(); + } + + public void timeWalkerPerformance(final int reps) { + for(int i = 0; i < reps; i++) { + GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + + // Establish the argument collection + GATKArgumentCollection argCollection = new GATKArgumentCollection(); + argCollection.referenceFile = new File(referenceFile); + argCollection.samFiles = Collections.singletonList(inputFile.getAbsolutePath()); + + engine.setArguments(argCollection); + // Bugs in the engine mean that this has to be set twice. + engine.setSAMFileIDs(Collections.singletonList(new SAMReaderID(inputFile,new Tags()))); + engine.setFilters(Collections.singletonList(new UnmappedReadFilter())); + engine.setReferenceMetaDataFiles(Collections.emptyList()); + + // Create the walker + engine.setWalker(walkerType.create()); + + engine.execute(); + } + } + + private enum WalkerType { + COUNT_READS { + @Override + Walker create() { return new CountReadsPerformanceWalker(); } + }, + COUNT_BASES_IN_READ { + @Override + Walker create() { return new CountBasesInReadPerformanceWalker(); } + }, + COUNT_LOCI { + @Override + Walker create() { + CountLociPerformanceWalker walker = new CountLociPerformanceWalker(); + JVMUtils.setFieldValue(JVMUtils.findField(CountLociPerformanceWalker.class,"out"),walker,System.out); + return walker; + } + }; + abstract Walker create(); + } +} + +class CountLociPerformanceWalker extends TestCountLociWalker { + // NOTE: Added this output during porting. Previous version of test was reaching out of engine + // and into production o.b.g.tools.walkers.qc.CountLoci. + @Output + PrintStream out; + + @Override + public void onTraversalDone(Long result) { + out.println(result); + } +} + +class CountReadsPerformanceWalker extends TestCountReadsWalker { +} + +class CountBasesInReadPerformanceWalker extends ReadWalker { + private long As; + private long Cs; + private long Gs; + private long Ts; + + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { + for(byte base: read.getReadBases()) { + switch(base) { + case 'A': As++; break; + case 'C': Cs++; break; + case 'G': Gs++; break; + case 'T': Ts++; break; + } + } + return 1; + } + + public Long reduceInit() { return 0L; } + public Long reduce(Integer value, Long accum) { return value + accum; } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/MockLocusShard.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/MockLocusShard.java new file mode 100644 index 000000000..bed203b3d --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/MockLocusShard.java @@ -0,0 +1,49 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; + +import java.util.List; +import java.util.Collections; + +/** + * A mock locus shard, usable for infrastructure that requires a shard to behave properly. + * + * @author mhanna + * @version 0.1 + */ +public class MockLocusShard extends LocusShard { + public MockLocusShard(final GenomeLocParser genomeLocParser,final List intervals) { + super( genomeLocParser, + new SAMDataSource(null, Collections.emptyList(),new ThreadAllocation(),null,genomeLocParser), + intervals, + null); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/PicardBaselineBenchmark.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/PicardBaselineBenchmark.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/PicardBaselineBenchmark.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/PicardBaselineBenchmark.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadProcessingBenchmark.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadProcessingBenchmark.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadProcessingBenchmark.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadProcessingBenchmark.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShardBalancerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShardBalancerUnitTest.java new file mode 100644 index 000000000..c4f6159a1 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShardBalancerUnitTest.java @@ -0,0 +1,197 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; + +public class ReadShardBalancerUnitTest extends BaseTest { + + /** + * Tests to ensure that ReadShardBalancer works as expected and does not place shard boundaries + * at inappropriate places, such as within an alignment start position + */ + private static class ReadShardBalancerTest extends TestDataProvider { + private int numContigs; + private int numStacksPerContig; + private int stackSize; + private int numUnmappedReads; + private DownsamplingMethod downsamplingMethod; + private int expectedReadCount; + + private SAMFileHeader header; + private SAMReaderID testBAM; + + public ReadShardBalancerTest( int numContigs, + int numStacksPerContig, + int stackSize, + int numUnmappedReads, + int downsamplingTargetCoverage ) { + super(ReadShardBalancerTest.class); + + this.numContigs = numContigs; + this.numStacksPerContig = numStacksPerContig; + this.stackSize = stackSize; + this.numUnmappedReads = numUnmappedReads; + + this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null); + this.expectedReadCount = Math.min(stackSize, downsamplingTargetCoverage) * numStacksPerContig * numContigs + numUnmappedReads; + + setName(String.format("%s: numContigs=%d numStacksPerContig=%d stackSize=%d numUnmappedReads=%d downsamplingTargetCoverage=%d", + getClass().getSimpleName(), numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage)); + } + + public void run() { + createTestBAM(); + + SAMDataSource dataSource = new SAMDataSource(null, // Reference not used in this test. + Arrays.asList(testBAM), + new ThreadAllocation(), + null, + new GenomeLocParser(header.getSequenceDictionary()), + false, + ValidationStringency.SILENT, + ReadShard.DEFAULT_MAX_READS, // reset ReadShard.MAX_READS to ReadShard.DEFAULT_MAX_READS for each test + downsamplingMethod, + new ValidationExclusion(), + new ArrayList(), + false); + + Iterable shardIterator = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + + SAMRecord readAtEndOfLastShard = null; + int totalReadsSeen = 0; + + for ( Shard shard : shardIterator ) { + int numContigsThisShard = 0; + SAMRecord lastRead = null; + + for ( SAMRecord read : shard.iterator() ) { + totalReadsSeen++; + + if ( lastRead == null ) { + numContigsThisShard = 1; + } + else if ( ! read.getReadUnmappedFlag() && ! lastRead.getReferenceIndex().equals(read.getReferenceIndex()) ) { + numContigsThisShard++; + } + + // If the last read from the previous shard is not unmapped, we have to make sure + // that no reads in this shard start at the same position + if ( readAtEndOfLastShard != null && ! readAtEndOfLastShard.getReadUnmappedFlag() ) { + Assert.assertFalse(readAtEndOfLastShard.getReferenceIndex().equals(read.getReferenceIndex()) && + readAtEndOfLastShard.getAlignmentStart() == read.getAlignmentStart(), + String.format("Reads from alignment start position %d:%d are split across multiple shards", + read.getReferenceIndex(), read.getAlignmentStart())); + } + + lastRead = read; + } + + // There should never be reads from more than 1 contig in a shard (ignoring unmapped reads) + Assert.assertTrue(numContigsThisShard == 1, "found a shard with reads from multiple contigs"); + + readAtEndOfLastShard = lastRead; + } + + Assert.assertEquals(totalReadsSeen, expectedReadCount, "did not encounter the expected number of reads"); + } + + private void createTestBAM() { + header = ArtificialSAMUtils.createArtificialSamHeader(numContigs, 1, 100000); + SAMReadGroupRecord readGroup = new SAMReadGroupRecord("foo"); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); + ArtificialSingleSampleReadStream artificialReads = new ArtificialSingleSampleReadStream(header, + "foo", + numContigs, + numStacksPerContig, + stackSize, + stackSize, + 1, + 100, + 50, + 150, + numUnmappedReads); + + final File testBAMFile = createTempFile("SAMDataSourceFillShardBoundaryTest", ".bam"); + + SAMFileWriter bamWriter = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(header, true, testBAMFile); + for ( SAMRecord read : artificialReads ) { + bamWriter.addAlignment(read); + } + bamWriter.close(); + + testBAM = new SAMReaderID(testBAMFile, new Tags()); + + new File(testBAM.getSamFilePath().replace(".bam", ".bai")).deleteOnExit(); + new File(testBAM.getSamFilePath() + ".bai").deleteOnExit(); + } + } + + @DataProvider(name = "ReadShardBalancerTestDataProvider") + public Object[][] createReadShardBalancerTests() { + for ( int numContigs = 1; numContigs <= 3; numContigs++ ) { + for ( int numStacksPerContig : Arrays.asList(1, 2, 4) ) { + // Use crucial read shard boundary values as the stack sizes + for ( int stackSize : Arrays.asList(ReadShard.DEFAULT_MAX_READS / 2, ReadShard.DEFAULT_MAX_READS / 2 + 10, ReadShard.DEFAULT_MAX_READS, ReadShard.DEFAULT_MAX_READS - 1, ReadShard.DEFAULT_MAX_READS + 1, ReadShard.DEFAULT_MAX_READS * 2) ) { + for ( int numUnmappedReads : Arrays.asList(0, ReadShard.DEFAULT_MAX_READS / 2, ReadShard.DEFAULT_MAX_READS * 2) ) { + // The first value will result in no downsampling at all, the others in some downsampling + for ( int downsamplingTargetCoverage : Arrays.asList(ReadShard.DEFAULT_MAX_READS * 10, ReadShard.DEFAULT_MAX_READS, ReadShard.DEFAULT_MAX_READS / 2) ) { + new ReadShardBalancerTest(numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage); + } + } + } + } + } + + return ReadShardBalancerTest.getTests(ReadShardBalancerTest.class); + } + + @Test(dataProvider = "ReadShardBalancerTestDataProvider") + public void runReadShardBalancerTest( ReadShardBalancerTest test ) { + logger.warn("Running test: " + test); + + test.run(); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSourceUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSourceUnitTest.java new file mode 100644 index 000000000..8be72a22c --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSourceUnitTest.java @@ -0,0 +1,268 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static org.testng.Assert.*; + +/** + *

    + * Class SAMDataSourceUnitTest + *

    + * The test of the SAMBAM simple data source. + */ +public class SAMDataSourceUnitTest extends BaseTest { + + // TODO: These legacy tests should really be replaced with a more comprehensive suite of tests for SAMDataSource + + private List readers; + private File referenceFile; + private IndexedFastaSequenceFile seq; + private GenomeLocParser genomeLocParser; + + /** + * This function does the setup of our parser, before each method call. + *

    + * Called before every test case method. + */ + @BeforeMethod + public void doForEachTest() throws FileNotFoundException { + readers = new ArrayList(); + + // sequence + referenceFile = new File(b36KGReference); + seq = new CachingIndexedFastaSequenceFile(referenceFile); + genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); + } + + /** + * Tears down the test fixture after each call. + *

    + * Called after every test case method. + */ + @AfterMethod + public void undoForEachTest() { + seq = null; + readers.clear(); + } + + + /** Test out that we can shard the file and iterate over every read */ + @Test + public void testLinearBreakIterateAll() { + logger.warn("Executing testLinearBreakIterateAll"); + + // setup the data + readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); + + // the sharding strat. + SAMDataSource data = new SAMDataSource( + referenceFile, + readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false); + + Iterable strat = data.createShardIteratorOverMappedReads(new LocusShardBalancer()); + int count = 0; + + try { + for (Shard sh : strat) { + int readCount = 0; + count++; + + GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1); + logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig()); + logger.debug("count = " + count); + GATKSAMIterator datum = data.seek(sh); + + // for the first couple of shards make sure we can see the reads + if (count < 5) { + for (SAMRecord r : datum) { + } + readCount++; + } + datum.close(); + + // if we're over 100 shards, break out + if (count > 100) { + break; + } + } + } + catch (UserException.CouldNotReadInputFile e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); + } + } + + /** Test that we clear program records when requested */ + @Test + public void testRemoveProgramRecords() { + logger.warn("Executing testRemoveProgramRecords"); + + // setup the data + readers.add(new SAMReaderID(new File(b37GoodBAM),new Tags())); + + // use defaults + SAMDataSource data = new SAMDataSource( + referenceFile, + readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false); + + List defaultProgramRecords = data.getHeader().getProgramRecords(); + assertTrue(defaultProgramRecords.size() != 0, "testRemoveProgramRecords: No program records found when using default constructor"); + + boolean removeProgramRecords = false; + data = new SAMDataSource( + referenceFile, + readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + Collections.emptyList(), + false, + (byte) -1, + removeProgramRecords, + false, + null, IntervalMergingRule.ALL); + + List dontRemoveProgramRecords = data.getHeader().getProgramRecords(); + assertEquals(dontRemoveProgramRecords, defaultProgramRecords, "testRemoveProgramRecords: default program records differ from removeProgramRecords = false"); + + removeProgramRecords = true; + data = new SAMDataSource( + referenceFile, + readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + Collections.emptyList(), + false, + (byte) -1, + removeProgramRecords, + false, + null, IntervalMergingRule.ALL); + + List doRemoveProgramRecords = data.getHeader().getProgramRecords(); + assertTrue(doRemoveProgramRecords.isEmpty(), "testRemoveProgramRecords: program records not cleared when removeProgramRecords = true"); + } + + @Test(expectedExceptions = UserException.class) + public void testFailOnReducedReads() { + readers.add(new SAMReaderID(new File(privateTestDir + "old.reduced.bam"), new Tags())); + + SAMDataSource data = new SAMDataSource( + referenceFile, + readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false); + } + + @Test(expectedExceptions = UserException.class) + public void testFailOnReducedReadsRemovingProgramRecords() { + readers.add(new SAMReaderID(new File(privateTestDir + "old.reduced.bam"), new Tags())); + + SAMDataSource data = new SAMDataSource( + referenceFile, + readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + Collections.emptyList(), + false, + (byte) -1, + true, + false, + null, IntervalMergingRule.ALL); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMReaderIDUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMReaderIDUnitTest.java new file mode 100644 index 000000000..c975fb166 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMReaderIDUnitTest.java @@ -0,0 +1,50 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; + +public class SAMReaderIDUnitTest extends BaseTest { + + @Test + public void testSAMReaderIDHashingAndEquality() { + // Test to make sure that two SAMReaderIDs that point at the same file via an absolute vs. relative + // path are equal according to equals() and have the same hash code + final File relativePathToBAMFile = new File(publicTestDir + "exampleBAM.bam"); + final File absolutePathToBAMFile = new File(relativePathToBAMFile.getAbsolutePath()); + final SAMReaderID relativePathSAMReaderID = new SAMReaderID(relativePathToBAMFile, new Tags()); + final SAMReaderID absolutePathSAMReaderID = new SAMReaderID(absolutePathToBAMFile, new Tags()); + + Assert.assertEquals(relativePathSAMReaderID, absolutePathSAMReaderID, "Absolute-path and relative-path SAMReaderIDs not equal according to equals()"); + Assert.assertEquals(relativePathSAMReaderID.hashCode(), absolutePathSAMReaderID.hashCode(), "Absolute-path and relative-path SAMReaderIDs have different hash codes"); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SeekableBufferedStreamUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SeekableBufferedStreamUnitTest.java new file mode 100644 index 000000000..c67cadb8d --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SeekableBufferedStreamUnitTest.java @@ -0,0 +1,104 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.seekablestream.SeekableBufferedStream; +import htsjdk.samtools.seekablestream.SeekableFileStream; +import org.broadinstitute.gatk.utils.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; + +/** + * Test basic functionality in SeekableBufferedStream. + */ +public class SeekableBufferedStreamUnitTest extends BaseTest { + private static File InputFile = new File(validationDataLocation + "megabyteZeros.dat"); + + final private int BUFFERED_STREAM_BUFFER_SIZE = 100; + private byte buffer[] = new byte[BUFFERED_STREAM_BUFFER_SIZE * 10]; + + + @DataProvider(name = "BasicArgumentsDivisible") + public Integer[][] DivisableReads() { + return new Integer[][]{{1}, {4}, {5}, {10}, {20}, {50}, {100}}; + } + + @DataProvider(name = "BasicArgumentsIndivisibleAndSmall") + public Integer[][] InDivisableReadsSmall() { + return new Integer[][]{{3}, {11}, {31}, {51}, {77}, {99}}; + } + + @DataProvider(name = "BasicArgumentsIndivisibleYetLarge") + public Integer[][] InDivisableReadsLarge() { + return new Integer[][]{{101}, {151}, {205}, {251}, {301}}; + } + + + private void testReadsLength(int length) throws IOException { + final int READ_SIZE=100000; //file is 10^6, so make this smaller to be safe. + + SeekableFileStream fileStream = new SeekableFileStream(InputFile); + SeekableBufferedStream bufferedStream = new SeekableBufferedStream(fileStream, BUFFERED_STREAM_BUFFER_SIZE); + + for (int i = 0; i < READ_SIZE / length; ++i) { + Assert.assertEquals(bufferedStream.read(buffer, 0, length), length); + } + + } + + // These tests fail because SeekableBuffered stream may return _less_ than the amount you are asking for. + // make sure that you wrap reads with while-loops. If these test start failing (meaning that the reads work properly, + // the layer of protection built into GATKBamIndex can be removed. + // + // pdexheimer, Jan 2015 - SeekableBufferedStream no longer returns less than the expected amount. + // Renaming testIndivisableSmallReadsFAIL to testIndivisableSmallReadsPASS and removing the expected exception + // If this bug regresses, the while loop will need to be re-introduced into GATKBamIndex.read() + + @Test(dataProvider = "BasicArgumentsIndivisibleAndSmall", enabled = true) + public void testIndivisableSmallReadsPASS(Integer readLength) throws IOException { + testReadsLength(readLength); + } + + //Evidently, if you ask for a read length that's larger than the inernal buffer, + //SeekableBufferedStreamdoes something else and gives you what you asked for + + @Test(dataProvider = "BasicArgumentsIndivisibleYetLarge", enabled = true) + public void testIndivisableLargeReadsPASS(Integer readLength) throws IOException { + testReadsLength(readLength); + } + + // if the readlength divides the buffer, there are no failures + @Test(dataProvider = "BasicArgumentsDivisible", enabled = true) + public void testDivisableReadsPASS(Integer readLength) throws IOException { + testReadsLength(readLength); + } + + +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/TheoreticalMinimaBenchmark.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/TheoreticalMinimaBenchmark.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/TheoreticalMinimaBenchmark.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/TheoreticalMinimaBenchmark.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSourceIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSourceIntegrationTest.java new file mode 100644 index 000000000..a544d716a --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSourceIntegrationTest.java @@ -0,0 +1,75 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reference; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.io.File; +import java.io.IOException; + +public class ReferenceDataSourceIntegrationTest extends WalkerTest { + + @Test + public void testReferenceWithMissingFaiFile() throws IOException { + final File dummyReference = createTempFile("dummy", ".fasta"); + final File dictFile = new File(dummyReference.getAbsolutePath().replace(".fasta", ".dict")); + dictFile.deleteOnExit(); + Assert.assertTrue(dictFile.createNewFile()); + + final WalkerTestSpec spec = new WalkerTestSpec( + " -T TestPrintReadsWalker" + + " -R " + dummyReference.getAbsolutePath() + + " -I " + privateTestDir + "NA12878.4.snippet.bam" + + " -o %s", + 1, + UserException.MissingReferenceFaiFile.class + ); + + executeTest("testReferenceWithMissingFaiFile", spec); + } + + @Test + public void testReferenceWithMissingDictFile() throws IOException { + final File dummyReference = createTempFile("dummy", ".fasta"); + final File faiFile = new File(dummyReference.getAbsolutePath() + ".fai"); + faiFile.deleteOnExit(); + Assert.assertTrue(faiFile.createNewFile()); + + final WalkerTestSpec spec = new WalkerTestSpec( + " -T TestPrintReadsWalker" + + " -R " + dummyReference.getAbsolutePath() + + " -I " + privateTestDir + "NA12878.4.snippet.bam" + + " -o %s", + 1, + UserException.MissingReferenceDictFile.class + ); + + executeTest("testReferenceWithMissingDictFile", spec); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java new file mode 100644 index 000000000..a77c0961c --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java @@ -0,0 +1,208 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.rmd; + +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrackBuilder; +import org.testng.Assert; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.codecs.table.TableFeature; +import org.broadinstitute.gatk.utils.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet.RMDStorageType; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; + +import static org.testng.Assert.assertTrue; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +/** + * User: hanna + * Date: May 21, 2009 + * Time: 11:03:04 AM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Test the contents and number of iterators in the pool. + */ + +public class ReferenceOrderedDataPoolUnitTest extends BaseTest { + + private RMDTriplet triplet = null; + private RMDTrackBuilder builder = null; + + private IndexedFastaSequenceFile seq; + private GenomeLocParser genomeLocParser; + + private GenomeLoc testSite1; + private GenomeLoc testSite2; + private GenomeLoc testSite3; + + private GenomeLoc testInterval1; // an interval matching testSite1 -> testSite2 for queries + private GenomeLoc testInterval2; // an interval matching testSite2 -> testSite3 for queries + + + @BeforeClass + public void init() throws FileNotFoundException { + seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); + genomeLocParser = new GenomeLocParser(seq); + + testSite1 = genomeLocParser.createGenomeLoc("chrM",10); + testSite2 = genomeLocParser.createGenomeLoc("chrM",20); + testSite3 = genomeLocParser.createGenomeLoc("chrM",30); + testInterval1 = genomeLocParser.createGenomeLoc("chrM",10,20); + testInterval2 = genomeLocParser.createGenomeLoc("chrM",20,30); + } + + @BeforeMethod + public void setUp() { + String fileName = privateTestDir + "TabularDataTest.dat"; + + triplet = new RMDTriplet("tableTest","Table",fileName,RMDStorageType.FILE,new Tags()); + // disable auto-index creation/locking in the RMDTrackBuilder for tests + builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null,true,null); + } + + @Test + public void testCreateSingleIterator() { + ResourcePool iteratorPool = new ReferenceOrderedDataPool(triplet,builder,seq.getSequenceDictionary(),genomeLocParser,false); + LocationAwareSeekableRODIterator iterator = (LocationAwareSeekableRODIterator)iteratorPool.iterator( new MappedStreamSegment(testSite1) ); + + Assert.assertEquals(iteratorPool.numIterators(), 1, "Number of iterators in the pool is incorrect"); + Assert.assertEquals(iteratorPool.numAvailableIterators(), 0, "Number of available iterators in the pool is incorrect"); + + TableFeature datum = (TableFeature)iterator.next().get(0).getUnderlyingObject(); + + assertTrue(datum.getLocation().equals(testSite1)); + assertTrue(datum.get("COL1").equals("A")); + assertTrue(datum.get("COL2").equals("B")); + assertTrue(datum.get("COL3").equals("C")); + + iteratorPool.release(iterator); + + Assert.assertEquals(iteratorPool.numIterators(), 1, "Number of iterators in the pool is incorrect"); + Assert.assertEquals(iteratorPool.numAvailableIterators(), 1, "Number of available iterators in the pool is incorrect"); + } + + @Test + public void testCreateMultipleIterators() { + ReferenceOrderedQueryDataPool iteratorPool = new ReferenceOrderedQueryDataPool(triplet,builder,seq.getSequenceDictionary(),genomeLocParser); + LocationAwareSeekableRODIterator iterator1 = iteratorPool.iterator( new MappedStreamSegment(testInterval1) ); + + // Create a new iterator at position 2. + LocationAwareSeekableRODIterator iterator2 = iteratorPool.iterator( new MappedStreamSegment(testInterval2) ); + + Assert.assertEquals(iteratorPool.numIterators(), 2, "Number of iterators in the pool is incorrect"); + Assert.assertEquals(iteratorPool.numAvailableIterators(), 0, "Number of available iterators in the pool is incorrect"); + + // Test out-of-order access: first iterator2, then iterator1. + // Ugh...first call to a region needs to be a seek. + TableFeature datum = (TableFeature)iterator2.seekForward(testSite2).get(0).getUnderlyingObject(); + assertTrue(datum.getLocation().equals(testSite2)); + assertTrue(datum.get("COL1").equals("C")); + assertTrue(datum.get("COL2").equals("D")); + assertTrue(datum.get("COL3").equals("E")); + + datum = (TableFeature)iterator1.next().get(0).getUnderlyingObject(); + assertTrue(datum.getLocation().equals(testSite1)); + assertTrue(datum.get("COL1").equals("A")); + assertTrue(datum.get("COL2").equals("B")); + assertTrue(datum.get("COL3").equals("C")); + + // Advance iterator2, and make sure both iterator's contents are still correct. + datum = (TableFeature)iterator2.next().get(0).getUnderlyingObject(); + assertTrue(datum.getLocation().equals(testSite3)); + assertTrue(datum.get("COL1").equals("F")); + assertTrue(datum.get("COL2").equals("G")); + assertTrue(datum.get("COL3").equals("H")); + + datum = (TableFeature)iterator1.next().get(0).getUnderlyingObject(); + assertTrue(datum.getLocation().equals(testSite2)); + assertTrue(datum.get("COL1").equals("C")); + assertTrue(datum.get("COL2").equals("D")); + assertTrue(datum.get("COL3").equals("E")); + + // Cleanup, and make sure the number of iterators dies appropriately. + iteratorPool.release(iterator1); + + Assert.assertEquals(iteratorPool.numIterators(), 2, "Number of iterators in the pool is incorrect"); + Assert.assertEquals(iteratorPool.numAvailableIterators(), 1, "Number of available iterators in the pool is incorrect"); + + iteratorPool.release(iterator2); + + Assert.assertEquals(iteratorPool.numIterators(), 2, "Number of iterators in the pool is incorrect"); + Assert.assertEquals(iteratorPool.numAvailableIterators(), 2, "Number of available iterators in the pool is incorrect"); + } + + @Test + public void testIteratorConservation() { + ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(triplet,builder,seq.getSequenceDictionary(),genomeLocParser,false); + LocationAwareSeekableRODIterator iterator = iteratorPool.iterator( new MappedStreamSegment(testSite1) ); + + Assert.assertEquals(iteratorPool.numIterators(), 1, "Number of iterators in the pool is incorrect"); + Assert.assertEquals(iteratorPool.numAvailableIterators(), 0, "Number of available iterators in the pool is incorrect"); + + TableFeature datum = (TableFeature)iterator.next().get(0).getUnderlyingObject(); + assertTrue(datum.getLocation().equals(testSite1)); + assertTrue(datum.get("COL1").equals("A")); + assertTrue(datum.get("COL2").equals("B")); + assertTrue(datum.get("COL3").equals("C")); + + iteratorPool.release(iterator); + + // Create another iterator after the current iterator. + iterator = iteratorPool.iterator( new MappedStreamSegment(testSite3) ); + + // Make sure that the previously acquired iterator was reused. + Assert.assertEquals(iteratorPool.numIterators(), 1, "Number of iterators in the pool is incorrect"); + Assert.assertEquals(iteratorPool.numAvailableIterators(), 0, "Number of available iterators in the pool is incorrect"); + + datum = (TableFeature)iterator.seekForward(testSite3).get(0).getUnderlyingObject(); + assertTrue(datum.getLocation().equals(testSite3)); + assertTrue(datum.get("COL1").equals("F")); + assertTrue(datum.get("COL2").equals("G")); + assertTrue(datum.get("COL3").equals("H")); + + iteratorPool.release(iterator); + + Assert.assertEquals(iteratorPool.numIterators(), 1, "Number of iterators in the pool is incorrect"); + Assert.assertEquals(iteratorPool.numAvailableIterators(), 1, "Number of available iterators in the pool is incorrect"); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java new file mode 100644 index 000000000..514b85737 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java @@ -0,0 +1,89 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.rmd; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.refdata.utils.*; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +public class ReferenceOrderedQueryDataPoolUnitTest extends BaseTest{ + @Test + public void testCloseFilePointers() throws IOException { + // Build up query parameters + File file = new File(BaseTest.privateTestDir + "NA12878.hg19.example1.vcf"); + RMDTriplet triplet = new RMDTriplet("test", "VCF", file.getAbsolutePath(), RMDTriplet.RMDStorageType.FILE, new Tags()); + IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference)); + GenomeLocParser parser = new GenomeLocParser(seq); + GenomeLoc loc = parser.createGenomeLoc("20", 1, 100000); + TestRMDTrackBuilder builder = new TestRMDTrackBuilder(seq.getSequenceDictionary(), parser); + + // Create the query data pool + ReferenceOrderedQueryDataPool pool = new ReferenceOrderedQueryDataPool(triplet, builder, seq.getSequenceDictionary(), parser); + + for (int i = 0; i < 3; i++) { + // Ensure our tribble iterators are closed. + CheckableCloseableTribbleIterator.clearThreadIterators(); + Assert.assertTrue(CheckableCloseableTribbleIterator.getThreadIterators().isEmpty(), "Tribble iterators list was not cleared."); + + // Request the the rodIterator + LocationAwareSeekableRODIterator rodIterator = pool.iterator(new MappedStreamSegment(loc)); + + // Run normal iteration over rodIterator + Assert.assertTrue(rodIterator.hasNext(), "Rod iterator does not have a next value."); + GenomeLoc rodIteratorLocation = rodIterator.next().getLocation(); + Assert.assertEquals(rodIteratorLocation.getContig(), "20", "Instead of chr 20 rod iterator was at location " + rodIteratorLocation); + + // Check that the underlying tribbleIterators are still open. + List> tribbleIterators = CheckableCloseableTribbleIterator.getThreadIterators(); + Assert.assertFalse(tribbleIterators.isEmpty(), "Tribble iterators list is empty"); + for (CheckableCloseableTribbleIterator tribbleIterator: tribbleIterators) { + Assert.assertFalse(tribbleIterator.isClosed(), "Tribble iterator is closed but should be still open."); + } + + // Releasing the rodIterator should close the underlying tribbleIterator. + pool.release(rodIterator); + + // Check that the underlying tribbleIterators are now closed. + for (CheckableCloseableTribbleIterator tribbleIterator: tribbleIterators) { + Assert.assertTrue(tribbleIterator.isClosed(), "Tribble iterator is open but should be now closed."); + } + } + + // Extra cleanup. + CheckableCloseableTribbleIterator.clearThreadIterators(); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingIntegrationTest.java new file mode 100644 index 000000000..c98243adc --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingIntegrationTest.java @@ -0,0 +1,45 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.downsampling; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.annotations.Test; + +public class DownsamplingIntegrationTest extends WalkerTest { + + @Test + public void testDetectLowDcovValueWithLocusTraversal() { + final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker -R " + publicTestDir + "exampleFASTA.fasta -I " + publicTestDir + "exampleBAM.bam -o %s " + + "-dcov " + (DownsamplingMethod.MINIMUM_SAFE_COVERAGE_TARGET_FOR_LOCUS_BASED_TRAVERSALS - 1), + 1, + UserException.class + ); + executeTest("testDetectLowDcovValueWithLocusTraversal", spec); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingReadsIteratorUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingReadsIteratorUnitTest.java new file mode 100644 index 000000000..27804c6d1 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingReadsIteratorUnitTest.java @@ -0,0 +1,141 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.downsampling; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingReadsIterator; +import org.broadinstitute.gatk.utils.downsampling.SimplePositionalDownsampler; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.gatk.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class DownsamplingReadsIteratorUnitTest extends BaseTest { + + private static class DownsamplingReadsIteratorTest extends TestDataProvider { + private DownsamplingReadsIterator downsamplingIter; + private int targetCoverage; + private ArtificialSingleSampleReadStream stream; + private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer; + + public DownsamplingReadsIteratorTest( ArtificialSingleSampleReadStream stream, int targetCoverage ) { + super(DownsamplingReadsIteratorTest.class); + + this.stream = stream; + this.targetCoverage = targetCoverage; + + setName(String.format("%s: targetCoverage=%d numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d", + getClass().getSimpleName(), + targetCoverage, + stream.getNumContigs(), + stream.getNumStacksPerContig(), + stream.getMinReadsPerStack(), + stream.getMaxReadsPerStack(), + stream.getMinDistanceBetweenStacks(), + stream.getMaxDistanceBetweenStacks(), + stream.getMinReadLength(), + stream.getMaxReadLength(), + stream.getNumUnmappedReads())); + } + + public void run() { + streamAnalyzer = new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(stream, targetCoverage); + downsamplingIter = new DownsamplingReadsIterator(stream.getGATKSAMIterator(), new SimplePositionalDownsampler(targetCoverage)); + + streamAnalyzer.analyze(downsamplingIter); + + // Check whether the observed properties of the downsampled stream are what they should be + streamAnalyzer.validate(); + + // Allow memory used by this test to be reclaimed + stream = null; + streamAnalyzer = null; + downsamplingIter = null; + } + } + + @DataProvider(name = "DownsamplingReadsIteratorTestDataProvider") + public Object[][] createDownsamplingReadsIteratorTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(5, 1, 10000); + String readGroupID = "testReadGroup"; + SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); + + // Values that don't vary across tests + int targetCoverage = 10; + int minReadLength = 50; + int maxReadLength = 100; + int minDistanceBetweenStacks = 1; + int maxDistanceBetweenStacks = maxReadLength + 1; + + Utils.resetRandomGenerator(); + + // brute force testing! + for ( int numContigs : Arrays.asList(1, 2, 5) ) { + for ( int stacksPerContig : Arrays.asList(1, 2, 10) ) { + for ( int minReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) { + for ( int maxReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) { + for ( int numUnmappedReads : Arrays.asList(0, 1, targetCoverage, targetCoverage * 2) ) { + // Only interested in sane read stream configurations here + if ( minReadsPerStack <= maxReadsPerStack ) { + new DownsamplingReadsIteratorTest(new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + stacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads), + targetCoverage); + } + } + } + } + } + } + + return DownsamplingReadsIteratorTest.getTests(DownsamplingReadsIteratorTest.class); + } + + @Test(dataProvider = "DownsamplingReadsIteratorTestDataProvider") + public void runDownsamplingReadsIteratorTest( DownsamplingReadsIteratorTest test ) { + logger.warn("Running test: " + test); + + Utils.resetRandomGenerator(); + test.run(); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsamplerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsamplerUnitTest.java new file mode 100644 index 000000000..8e3ac5f49 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsamplerUnitTest.java @@ -0,0 +1,160 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.downsampling; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.downsampling.FractionalDownsampler; +import org.broadinstitute.gatk.utils.downsampling.ReadsDownsampler; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +public class FractionalDownsamplerUnitTest extends BaseTest { + + private static class FractionalDownsamplerTest extends TestDataProvider { + double fraction; + int totalReads; + int expectedMinNumReadsAfterDownsampling; + int expectedMaxNumReadsAfterDownsampling; + int expectedMinDiscardedItems; + int expectedMaxDiscardedItems; + + private static final double EXPECTED_ACCURACY = 0.05; // should be accurate to within +/- this percent + + public FractionalDownsamplerTest( double fraction, int totalReads ) { + super(FractionalDownsamplerTest.class); + + this.fraction = fraction; + this.totalReads = totalReads; + + calculateExpectations(); + + setName(String.format("%s: fraction=%.2f totalReads=%d expectedMinNumReadsAfterDownsampling=%d expectedMaxNumReadsAfterDownsampling=%d", + getClass().getSimpleName(), fraction, totalReads, expectedMinNumReadsAfterDownsampling, expectedMaxNumReadsAfterDownsampling)); + } + + private void calculateExpectations() { + // Require an exact match in the 0% and 100% cases + if ( fraction == 0.0 ) { + expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = 0; + expectedMinDiscardedItems = expectedMaxDiscardedItems = totalReads; + } + else if ( fraction == 1.0 ) { + expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = totalReads; + expectedMinDiscardedItems = expectedMaxDiscardedItems = 0; + } + else { + expectedMinNumReadsAfterDownsampling = Math.max((int)((fraction - EXPECTED_ACCURACY) * totalReads), 0); + expectedMaxNumReadsAfterDownsampling = Math.min((int) ((fraction + EXPECTED_ACCURACY) * totalReads), totalReads); + expectedMinDiscardedItems = totalReads - expectedMaxNumReadsAfterDownsampling; + expectedMaxDiscardedItems = totalReads - expectedMinNumReadsAfterDownsampling; + } + } + + public Collection createReads() { + Collection reads = new ArrayList(totalReads); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); + + return reads; + } + } + + @DataProvider(name = "FractionalDownsamplerTestDataProvider") + public Object[][] createFractionalDownsamplerTestData() { + for ( double fraction : Arrays.asList(0.0, 0.25, 0.5, 0.75, 1.0) ) { + for ( int totalReads : Arrays.asList(0, 1000, 10000) ) { + new FractionalDownsamplerTest(fraction, totalReads); + } + } + + return FractionalDownsamplerTest.getTests(FractionalDownsamplerTest.class); + } + + @Test(dataProvider = "FractionalDownsamplerTestDataProvider") + public void runFractionalDownsamplerTest( FractionalDownsamplerTest test ) { + logger.warn("Running test: " + test); + + Utils.resetRandomGenerator(); + + ReadsDownsampler downsampler = new FractionalDownsampler(test.fraction); + + downsampler.submit(test.createReads()); + + if ( test.totalReads > 0 ) { + if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + } + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.totalReads > 0 ) { + if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + } + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertTrue(downsampledReads.size() >= test.expectedMinNumReadsAfterDownsampling && + downsampledReads.size() <= test.expectedMaxNumReadsAfterDownsampling); + + Assert.assertTrue(downsampler.getNumberOfDiscardedItems() >= test.expectedMinDiscardedItems && + downsampler.getNumberOfDiscardedItems() <= test.expectedMaxDiscardedItems); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.totalReads - downsampledReads.size()); + + downsampler.resetStats(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/LevelingDownsamplerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/LevelingDownsamplerUnitTest.java new file mode 100644 index 000000000..74a936782 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/LevelingDownsamplerUnitTest.java @@ -0,0 +1,165 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.downsampling; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.downsampling.Downsampler; +import org.broadinstitute.gatk.utils.downsampling.LevelingDownsampler; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; +import org.testng.Assert; + +import java.util.*; + +public class LevelingDownsamplerUnitTest extends BaseTest { + + private static class LevelingDownsamplerUniformStacksTest extends TestDataProvider { + public enum DataStructure { LINKED_LIST, ARRAY_LIST } + + int targetSize; + int numStacks; + int stackSize; + DataStructure dataStructure; + int expectedSize; + + public LevelingDownsamplerUniformStacksTest( int targetSize, int numStacks, int stackSize, DataStructure dataStructure ) { + super(LevelingDownsamplerUniformStacksTest.class); + + this.targetSize = targetSize; + this.numStacks = numStacks; + this.stackSize = stackSize; + this.dataStructure = dataStructure; + expectedSize = calculateExpectedDownsampledStackSize(); + + setName(String.format("%s: targetSize=%d numStacks=%d stackSize=%d dataStructure=%s expectedSize=%d", + getClass().getSimpleName(), targetSize, numStacks, stackSize, dataStructure, expectedSize)); + } + + public Collection> createStacks() { + Collection> stacks = new ArrayList>(); + + for ( int i = 1; i <= numStacks; i++ ) { + List stack = dataStructure == DataStructure.LINKED_LIST ? new LinkedList() : new ArrayList(); + + for ( int j = 1; j <= stackSize; j++ ) { + stack.add(new Object()); + } + + stacks.add(stack); + } + + return stacks; + } + + private int calculateExpectedDownsampledStackSize() { + int numItemsToRemove = numStacks * stackSize - targetSize; + + if ( numStacks == 0 ) { + return 0; + } + else if ( numItemsToRemove <= 0 ) { + return stackSize; + } + + return Math.max(1, stackSize - (numItemsToRemove / numStacks)); + } + } + + @DataProvider(name = "UniformStacksDataProvider") + public Object[][] createUniformStacksTestData() { + for ( int targetSize = 1; targetSize <= 10000; targetSize *= 10 ) { + for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { + for ( int stackSize = 1; stackSize <= 1000; stackSize *= 10 ) { + for ( LevelingDownsamplerUniformStacksTest.DataStructure dataStructure : LevelingDownsamplerUniformStacksTest.DataStructure.values() ) { + new LevelingDownsamplerUniformStacksTest(targetSize, numStacks, stackSize, dataStructure); + } + } + } + } + + return LevelingDownsamplerUniformStacksTest.getTests(LevelingDownsamplerUniformStacksTest.class); + } + + @Test( dataProvider = "UniformStacksDataProvider" ) + public void testLevelingDownsamplerWithUniformStacks( LevelingDownsamplerUniformStacksTest test ) { + logger.warn("Running test: " + test); + + Utils.resetRandomGenerator(); + + Downsampler> downsampler = new LevelingDownsampler, Object>(test.targetSize); + + downsampler.submit(test.createStacks()); + + if ( test.numStacks > 0 ) { + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.numStacks > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + final int sizeFromDownsampler = downsampler.size(); + List> downsampledStacks = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertEquals(downsampledStacks.size(), test.numStacks); + + int totalRemainingItems = 0; + for ( List stack : downsampledStacks ) { + Assert.assertTrue(Math.abs(stack.size() - test.expectedSize) <= 1); + totalRemainingItems += stack.size(); + } + + Assert.assertEquals(sizeFromDownsampler, totalRemainingItems); + int numItemsReportedDiscarded = downsampler.getNumberOfDiscardedItems(); + int numItemsActuallyDiscarded = test.numStacks * test.stackSize - totalRemainingItems; + + Assert.assertEquals(numItemsReportedDiscarded, numItemsActuallyDiscarded); + + downsampler.resetStats(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + + Assert.assertTrue(totalRemainingItems <= Math.max(test.targetSize, test.numStacks)); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java new file mode 100644 index 000000000..fdc8587ba --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java @@ -0,0 +1,302 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.downsampling; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.downsampling.PerSampleDownsamplingReadsIterator; +import org.broadinstitute.gatk.utils.downsampling.ReadsDownsamplerFactory; +import org.broadinstitute.gatk.utils.downsampling.SimplePositionalDownsamplerFactory; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.engine.iterators.VerifyingSamIterator; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.sam.ArtificialMultiSampleReadStream; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.gatk.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class PerSampleDownsamplingReadsIteratorUnitTest extends BaseTest { + + private static class PerSampleDownsamplingReadsIteratorTest extends TestDataProvider { + + // TODO: tests should distinguish between variance across samples and variance within a sample + + private enum StreamDensity { + SPARSE (MAX_READ_LENGTH, MAX_READ_LENGTH * 2), + DENSE (1, MIN_READ_LENGTH), + MIXED (1, MAX_READ_LENGTH * 2), + UNIFORM_DENSE (1, 1), + UNIFORM_SPARSE (MAX_READ_LENGTH * 2, MAX_READ_LENGTH * 2); + + int minDistanceBetweenStacks; + int maxDistanceBetweenStacks; + + StreamDensity( int minDistanceBetweenStacks, int maxDistanceBetweenStacks ) { + this.minDistanceBetweenStacks = minDistanceBetweenStacks; + this.maxDistanceBetweenStacks = maxDistanceBetweenStacks; + } + + public String toString() { + return String.format("StreamDensity:%d-%d", minDistanceBetweenStacks, maxDistanceBetweenStacks); + } + } + + private enum StreamStackDepth { + NON_UNIFORM_LOW (1, 5), + NON_UNIFORM_HIGH (15, 20), + NON_UNIFORM_MIXED (1, 20), + UNIFORM_SINGLE (1, 1), + UNIFORM_LOW (2, 2), + UNIFORM_HIGH (20, 20), + UNIFORM_MEDIUM (10, 10); // should set target coverage to this value for testing + + int minReadsPerStack; + int maxReadsPerStack; + + StreamStackDepth( int minReadsPerStack, int maxReadsPerStack ) { + this.minReadsPerStack = minReadsPerStack; + this.maxReadsPerStack = maxReadsPerStack; + } + + public boolean isUniform() { + return minReadsPerStack == maxReadsPerStack; + } + + public String toString() { + return String.format("StreamStackDepth:%d-%d", minReadsPerStack, maxReadsPerStack); + } + } + + private enum StreamStacksPerContig { + UNIFORM(20, 20), + NON_UNIFORM(1, 30); + + int minStacksPerContig; + int maxStacksPerContig; + + StreamStacksPerContig( int minStacksPerContig, int maxStacksPerContig ) { + this.minStacksPerContig = minStacksPerContig; + this.maxStacksPerContig = maxStacksPerContig; + } + + public boolean isUniform() { + return minStacksPerContig == maxStacksPerContig; + } + + public String toString() { + return String.format("StreamStacksPerContig:%d-%d", minStacksPerContig, maxStacksPerContig); + } + } + + // Not interested in testing multiple ranges for the read lengths, as none of our current + // downsamplers are affected by read length + private static final int MIN_READ_LENGTH = 50; + private static final int MAX_READ_LENGTH = 150; + + private ReadsDownsamplerFactory downsamplerFactory; + private int targetCoverage; + private int numSamples; + private int minContigs; + private int maxContigs; + private StreamDensity streamDensity; + private StreamStackDepth streamStackDepth; + private StreamStacksPerContig streamStacksPerContig; + private double unmappedReadsFraction; + private int unmappedReadsCount; + private boolean verifySortedness; + + private ArtificialMultiSampleReadStream mergedReadStream; + private Map perSampleArtificialReadStreams; + private Map perSampleStreamAnalyzers; + private SAMFileHeader header; + + public PerSampleDownsamplingReadsIteratorTest( ReadsDownsamplerFactory downsamplerFactory, + int targetCoverage, + int numSamples, + int minContigs, + int maxContigs, + StreamDensity streamDensity, + StreamStackDepth streamStackDepth, + StreamStacksPerContig streamStacksPerContig, + double unmappedReadsFraction, + int unmappedReadsCount, + boolean verifySortedness ) { + super(PerSampleDownsamplingReadsIteratorTest.class); + + this.downsamplerFactory = downsamplerFactory; + this.targetCoverage = targetCoverage; + this.numSamples = numSamples; + this.minContigs = minContigs; + this.maxContigs = maxContigs; + this.streamDensity = streamDensity; + this.streamStackDepth = streamStackDepth; + this.streamStacksPerContig = streamStacksPerContig; + this.unmappedReadsFraction = unmappedReadsFraction; + this.unmappedReadsCount = unmappedReadsCount; + this.verifySortedness = verifySortedness; + + header = createHeader(); + createReadStreams(); + + setName(String.format("%s: targetCoverage=%d numSamples=%d minContigs=%d maxContigs=%d %s %s %s unmappedReadsFraction=%.2f unmappedReadsCount=%d verifySortedness=%b", + getClass().getSimpleName(), targetCoverage, numSamples, minContigs, maxContigs, streamDensity, streamStackDepth, streamStacksPerContig, unmappedReadsFraction, unmappedReadsCount, verifySortedness)); + } + + private SAMFileHeader createHeader() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(maxContigs, 1, (streamDensity.maxDistanceBetweenStacks + MAX_READ_LENGTH) * streamStacksPerContig.maxStacksPerContig + 100000); + List readGroups = new ArrayList(numSamples); + List sampleNames = new ArrayList(numSamples); + + for ( int i = 0; i < numSamples; i++ ) { + readGroups.add("ReadGroup" + i); + sampleNames.add("Sample" + i); + } + + return ArtificialSAMUtils.createEnumeratedReadGroups(header, readGroups, sampleNames); + } + + private void createReadStreams() { + perSampleArtificialReadStreams = new HashMap(numSamples); + perSampleStreamAnalyzers = new HashMap(numSamples); + + for (SAMReadGroupRecord readGroup : header.getReadGroups() ) { + String readGroupID = readGroup.getReadGroupId(); + String sampleName = readGroup.getSample(); + + int thisSampleNumContigs = MathUtils.randomIntegerInRange(minContigs, maxContigs); + int thisSampleStacksPerContig = MathUtils.randomIntegerInRange(streamStacksPerContig.minStacksPerContig, streamStacksPerContig.maxStacksPerContig); + + int thisSampleNumUnmappedReads = Utils.getRandomGenerator().nextDouble() < unmappedReadsFraction ? unmappedReadsCount : 0; + + ArtificialSingleSampleReadStream thisSampleStream = new ArtificialSingleSampleReadStream(header, + readGroupID, + thisSampleNumContigs, + thisSampleStacksPerContig, + streamStackDepth.minReadsPerStack, + streamStackDepth.maxReadsPerStack, + streamDensity.minDistanceBetweenStacks, + streamDensity.maxDistanceBetweenStacks, + MIN_READ_LENGTH, + MAX_READ_LENGTH, + thisSampleNumUnmappedReads); + perSampleArtificialReadStreams.put(sampleName, thisSampleStream); + perSampleStreamAnalyzers.put(sampleName, new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(thisSampleStream, targetCoverage)); + } + + mergedReadStream = new ArtificialMultiSampleReadStream(perSampleArtificialReadStreams.values()); + } + + public void run() { + GATKSAMIterator downsamplingIter = new PerSampleDownsamplingReadsIterator(mergedReadStream.getGATKSAMIterator(), downsamplerFactory); + + if ( verifySortedness ) { + downsamplingIter = new VerifyingSamIterator(downsamplingIter); + } + + while ( downsamplingIter.hasNext() ) { + SAMRecord read = downsamplingIter.next(); + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + + ArtificialSingleSampleReadStreamAnalyzer analyzer = perSampleStreamAnalyzers.get(sampleName); + if ( analyzer != null ) { + analyzer.update(read); + } + else { + throw new ReviewedGATKException("bug: stream analyzer for sample " + sampleName + " not found"); + } + } + + for ( Map.Entry analyzerEntry : perSampleStreamAnalyzers.entrySet() ) { + ArtificialSingleSampleReadStreamAnalyzer analyzer = analyzerEntry.getValue(); + analyzer.finalizeStats(); + + // Validate the downsampled read stream for each sample individually + analyzer.validate(); + } + + // Allow memory used by this test to be reclaimed: + mergedReadStream = null; + perSampleArtificialReadStreams = null; + perSampleStreamAnalyzers = null; + } + } + + @DataProvider(name = "PerSampleDownsamplingReadsIteratorTestDataProvider") + public Object[][] createPerSampleDownsamplingReadsIteratorTests() { + + Utils.resetRandomGenerator(); + + // Some values don't vary across tests + int targetCoverage = PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.UNIFORM_MEDIUM.minReadsPerStack; + ReadsDownsamplerFactory downsamplerFactory = new SimplePositionalDownsamplerFactory(targetCoverage); + int maxContigs = 3; + boolean verifySortedness = true; + + for ( int numSamples : Arrays.asList(1, 2, 10) ) { + for ( int minContigs = 1; minContigs <= maxContigs; minContigs++ ) { + for ( PerSampleDownsamplingReadsIteratorTest.StreamDensity streamDensity : PerSampleDownsamplingReadsIteratorTest.StreamDensity.values() ) { + for ( PerSampleDownsamplingReadsIteratorTest.StreamStackDepth streamStackDepth : PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.values() ) { + for (PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig streamStacksPerContig : PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig.values() ) { + for ( double unmappedReadsFraction : Arrays.asList(0.0, 1.0, 0.5) ) { + for ( int unmappedReadsCount : Arrays.asList(1, 50) ) { + new PerSampleDownsamplingReadsIteratorTest(downsamplerFactory, + targetCoverage, + numSamples, + minContigs, + maxContigs, + streamDensity, + streamStackDepth, + streamStacksPerContig, + unmappedReadsFraction, + unmappedReadsCount, + verifySortedness); + } + } + } + } + } + } + } + + return PerSampleDownsamplingReadsIteratorTest.getTests(PerSampleDownsamplingReadsIteratorTest.class); + } + + @Test(dataProvider = "PerSampleDownsamplingReadsIteratorTestDataProvider") + public void runPerSampleDownsamplingReadsIteratorTest( PerSampleDownsamplingReadsIteratorTest test ) { + logger.warn("Running test: " + test); + + Utils.resetRandomGenerator(); + test.run(); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsamplerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsamplerUnitTest.java new file mode 100644 index 000000000..88a1c5d5c --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsamplerUnitTest.java @@ -0,0 +1,133 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.downsampling; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.downsampling.ReadsDownsampler; +import org.broadinstitute.gatk.utils.downsampling.ReservoirDownsampler; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +public class ReservoirDownsamplerUnitTest extends BaseTest { + + private static class ReservoirDownsamplerTest extends TestDataProvider { + int reservoirSize; + int totalReads; + int expectedNumReadsAfterDownsampling; + int expectedNumDiscardedItems; + + public ReservoirDownsamplerTest( int reservoirSize, int totalReads ) { + super(ReservoirDownsamplerTest.class); + + this.reservoirSize = reservoirSize; + this.totalReads = totalReads; + + expectedNumReadsAfterDownsampling = Math.min(reservoirSize, totalReads); + expectedNumDiscardedItems = totalReads <= reservoirSize ? 0 : totalReads - reservoirSize; + + setName(String.format("%s: reservoirSize=%d totalReads=%d expectedNumReadsAfterDownsampling=%d expectedNumDiscardedItems=%d", + getClass().getSimpleName(), reservoirSize, totalReads, expectedNumReadsAfterDownsampling, expectedNumDiscardedItems)); + } + + public Collection createReads() { + Collection reads = new ArrayList(totalReads); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); + + return reads; + } + } + + @DataProvider(name = "ReservoirDownsamplerTestDataProvider") + public Object[][] createReservoirDownsamplerTestData() { + for ( int reservoirSize = 1; reservoirSize <= 10000; reservoirSize *= 10 ) { + new ReservoirDownsamplerTest(reservoirSize, 0); + for ( int totalReads = 1; totalReads <= 10000; totalReads *= 10 ) { + new ReservoirDownsamplerTest(reservoirSize, totalReads); + } + } + + return ReservoirDownsamplerTest.getTests(ReservoirDownsamplerTest.class); + } + + @Test(dataProvider = "ReservoirDownsamplerTestDataProvider") + public void testReservoirDownsampler( ReservoirDownsamplerTest test ) { + logger.warn("Running test: " + test); + + Utils.resetRandomGenerator(); + + ReadsDownsampler downsampler = new ReservoirDownsampler(test.reservoirSize); + + downsampler.submit(test.createReads()); + + if ( test.totalReads > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.totalReads > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + Assert.assertEquals(downsampler.size(), test.expectedNumReadsAfterDownsampling); + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertEquals(downsampledReads.size(), test.expectedNumReadsAfterDownsampling); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.expectedNumDiscardedItems); + Assert.assertEquals(test.totalReads - downsampledReads.size(), test.expectedNumDiscardedItems); + + downsampler.resetStats(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsamplerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsamplerUnitTest.java new file mode 100644 index 000000000..c22a3eaed --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsamplerUnitTest.java @@ -0,0 +1,333 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.downsampling; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.downsampling.ReadsDownsampler; +import org.broadinstitute.gatk.utils.downsampling.SimplePositionalDownsampler; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.*; + +public class SimplePositionalDownsamplerUnitTest extends BaseTest { + + private static class SimplePositionalDownsamplerTest extends TestDataProvider { + int targetCoverage; + int numStacks; + List stackSizes; + List expectedStackSizes; + boolean multipleContigs; + int totalInitialReads; + + public SimplePositionalDownsamplerTest( int targetCoverage, List stackSizes, boolean multipleContigs ) { + super(SimplePositionalDownsamplerTest.class); + + this.targetCoverage = targetCoverage; + this.numStacks = stackSizes.size(); + this.stackSizes = stackSizes; + this.multipleContigs = multipleContigs; + + calculateExpectedDownsampledStackSizes(); + + totalInitialReads = 0; + for ( Integer stackSize : stackSizes ) { + totalInitialReads += stackSize; + } + + setName(String.format("%s: targetCoverage=%d numStacks=%d stackSizes=%s expectedSizes=%s multipleContigs=%b", + getClass().getSimpleName(), targetCoverage, numStacks, stackSizes, expectedStackSizes, multipleContigs)); + } + + public Collection createReads() { + Collection reads = new ArrayList(); + SAMFileHeader header = multipleContigs ? + ArtificialSAMUtils.createArtificialSamHeader(2, 1, 1000000) : + ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + int refIndex = 0; + int alignmentStart = 1; + int readLength = 100; + + for ( int i = 0; i < numStacks; i++ ) { + if ( multipleContigs && refIndex == 0 && i >= numStacks / 2 ) { + refIndex++; + } + + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(stackSizes.get(i), header, "foo", + refIndex, alignmentStart, readLength)); + + alignmentStart += 10; + } + + return reads; + } + + private void calculateExpectedDownsampledStackSizes() { + expectedStackSizes = new ArrayList(numStacks); + + for ( Integer stackSize : stackSizes ) { + int expectedSize = targetCoverage >= stackSize ? stackSize : targetCoverage; + expectedStackSizes.add(expectedSize); + } + } + } + + @DataProvider(name = "SimplePositionalDownsamplerTestDataProvider") + public Object[][] createSimplePositionalDownsamplerTestData() { + Utils.resetRandomGenerator(); + + for ( int targetCoverage = 1; targetCoverage <= 10000; targetCoverage *= 10 ) { + for ( int contigs = 1; contigs <= 2; contigs++ ) { + for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { + List stackSizes = new ArrayList(numStacks); + for ( int stack = 1; stack <= numStacks; stack++ ) { + stackSizes.add(Utils.getRandomGenerator().nextInt(targetCoverage * 2) + 1); + } + new SimplePositionalDownsamplerTest(targetCoverage, stackSizes, contigs > 1); + } + } + } + + return SimplePositionalDownsamplerTest.getTests(SimplePositionalDownsamplerTest.class); + } + + @Test( dataProvider = "SimplePositionalDownsamplerTestDataProvider" ) + public void testSimplePostionalDownsampler( SimplePositionalDownsamplerTest test ) { + logger.warn("Running test: " + test); + + Utils.resetRandomGenerator(); + + ReadsDownsampler downsampler = new SimplePositionalDownsampler(test.targetCoverage); + + downsampler.submit(test.createReads()); + + if ( test.numStacks > 1 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else if ( test.numStacks == 1 ) { + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.numStacks > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + if ( test.numStacks == 0 ) { + Assert.assertTrue(downsampledReads.isEmpty()); + } + else { + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampledReads); + + Assert.assertEquals(downsampledStackSizes.size(), test.numStacks); + Assert.assertEquals(downsampledStackSizes, test.expectedStackSizes); + + int numReadsActuallyEliminated = test.totalInitialReads - downsampledReads.size(); + int numReadsReportedEliminated = downsampler.getNumberOfDiscardedItems(); + Assert.assertEquals(numReadsActuallyEliminated, numReadsReportedEliminated); + } + + downsampler.resetStats(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } + + private List getDownsampledStackSizesAndVerifySortedness( List downsampledReads ) { + List stackSizes = new ArrayList(); + + if ( downsampledReads.isEmpty() ) { + return stackSizes; + } + + Iterator iter = downsampledReads.iterator(); + Assert.assertTrue(iter.hasNext()); + + SAMRecord previousRead = iter.next(); + int currentStackSize = 1; + + while ( iter.hasNext() ) { + SAMRecord currentRead = iter.next(); + + if ( currentRead.getReferenceIndex() > previousRead.getReferenceIndex() || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) { + stackSizes.add(currentStackSize); + currentStackSize = 1; + } + else if ( currentRead.getReferenceIndex() < previousRead.getReferenceIndex() || currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) { + Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead)); + } + else { + currentStackSize++; + } + + previousRead = currentRead; + } + + stackSizes.add(currentStackSize); + return stackSizes; + } + + @Test + public void testSimplePositionalDownsamplerSignalNoMoreReadsBefore() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection readStack = new ArrayList(); + readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(50, header, "foo", 0, 1, 100)); + downsampler.submit(readStack); + + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + + SAMRecord laterRead = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 2, 100); + downsampler.signalNoMoreReadsBefore(laterRead); + + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(downsampledReads.size(), readStack.size()); + } + + @Test + public void testBasicUnmappedReadsSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection readStack = new ArrayList(); + readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, 100)); + for ( SAMRecord read : readStack ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + downsampler.submit(readStack); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler + Assert.assertEquals(downsampledReads.size(), readStack.size()); + + for ( SAMRecord read: downsampledReads ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + } + + @Test + public void testMixedMappedAndUnmappedReadsSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection mappedReadStack = new ArrayList(); + mappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", 0, 1, 100)); + for ( SAMRecord read : mappedReadStack ) { + Assert.assertFalse(read.getReadUnmappedFlag()); + } + + Collection unmappedReadStack = new ArrayList(); + unmappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, 100)); + for ( SAMRecord read : unmappedReadStack ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + downsampler.submit(mappedReadStack); + downsampler.submit(unmappedReadStack); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler + Assert.assertEquals(downsampledReads.size(), 300); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 100); + + int count = 1; + for ( SAMRecord read: downsampledReads ) { + if ( count <= 100 ) { + Assert.assertFalse(read.getReadUnmappedFlag()); + } + else { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + count++; + } + } + + @Test + public void testGATKSAMRecordSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + List reads = new ArrayList(); + for ( int i = 0; i < 10; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10)); + } + + downsampler.submit(reads); + downsampler.signalEndOfInput(); + List downsampledReads = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(downsampledReads.size(), 10); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/executive/ReduceTreeUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/executive/ReduceTreeUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/executive/ReduceTreeUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/executive/ReduceTreeUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/AllowNCigarMalformedReadFilterUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/AllowNCigarMalformedReadFilterUnitTest.java new file mode 100644 index 000000000..d3fb18896 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/AllowNCigarMalformedReadFilterUnitTest.java @@ -0,0 +1,77 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.Collections; + + +/** + * Tests for the {@link MalformedReadFilter} when the unsafe flag + * {@link ValidationExclusion.TYPE#ALLOW_N_CIGAR_READS} is set. + * + * @author Valentin Ruano-Rubio + * @since 6/6/13 + */ +public class AllowNCigarMalformedReadFilterUnitTest extends MalformedReadFilterUnitTest { + + + @Override + protected ValidationExclusion composeValidationExclusion() { + return new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS)); + } + + + @Test(enabled = true, + dataProvider= "UnsupportedCigarOperatorDataProvider") + @CigarOperatorTest(CigarOperatorTest.Outcome.IGNORE) + public void testCigarNOperatorFilterIgnore(final String cigarString) { + + final MalformedReadFilter filter = buildMalformedReadFilter(false); + final SAMRecord nContainingCigarRead = buildSAMRecord(cigarString); + Assert.assertFalse(filter.filterOut(nContainingCigarRead), + "filters out N containing Cigar when it should ignore the fact"); + } + + @Test(enabled = false) + @Override + public void testCigarNOperatorFilterException(final String cigarString) { + // Nothing to do here. + // Just deactivates the parents test case. + } + + + + + + + +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadCigarFilterUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadCigarFilterUnitTest.java new file mode 100644 index 000000000..f774af092 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadCigarFilterUnitTest.java @@ -0,0 +1,97 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.Cigar; +import org.broadinstitute.gatk.utils.clipping.ReadClipperTestUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.List; + +/** + * Checks that the Bad Cigar filter works for all kinds of wonky cigars + * + * @author Mauricio Carneiro + * @since 3/20/12 + */ +public class BadCigarFilterUnitTest { + + public static final String[] BAD_CIGAR_LIST = { + "2D4M", // starting with multiple deletions + "4M2D", // ending with multiple deletions + "3M1I1D", // adjacent indels AND ends in deletion + "1M1I1D2M", // adjacent indels I->D + "1M1D2I1M", // adjacent indels D->I + "1M1I2M1D", // ends in single deletion with insertion in the middle + "4M1D", // ends in single deletion + "1D4M", // starts with single deletion + "2M1D1D2M", // adjacent D's + "1M1I1I1M", // adjacent I's + "1H1D4M", // starting with deletion after H + "1S1D3M", // starting with deletion after S + "1H1S1D3M", // starting with deletion after HS + "4M1D1H", // ending with deletion before H + "3M1D1S", // ending with deletion before S + "3M1D1S1H", // ending with deletion before HS + "10M2H10M", // H in the middle + "10M2S10M", // S in the middle + "1H1S10M2S10M1S1H", // deceiving S in the middle + "1H1S10M2H10M1S1H" // deceiving H in the middle + }; + + BadCigarFilter filter; + + @BeforeClass + public void init() { + filter = new BadCigarFilter(); + } + + @Test(enabled = true) + public void testWonkyCigars() { + for (String cigarString : BAD_CIGAR_LIST) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigarString, 0); + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + } + } + + @Test(enabled = true) + public void testReadCigarLengthMismatch() { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar("4M", 1); + Assert.assertTrue(filter.filterOut(read), read.getCigarString()); + } + + @Test(enabled = true) + public void testGoodCigars() { + List cigarList = ReadClipperTestUtils.generateCigarList(10); + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar, 0); + Assert.assertFalse(filter.filterOut(read), read.getCigarString()); + } + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadReadGroupsIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadReadGroupsIntegrationTest.java new file mode 100644 index 000000000..f4232067d --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadReadGroupsIntegrationTest.java @@ -0,0 +1,52 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.annotations.Test; + + +public class BadReadGroupsIntegrationTest extends WalkerTest { + + @Test + public void testMissingReadGroup() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestPrintReadsWalker -R " + b36KGReference + " -I " + privateTestDir + "missingReadGroup.bam -o /dev/null", + 0, + UserException.ReadMissingReadGroup.class); + executeTest("test Missing Read Group", spec); + } + + @Test + public void testUndefinedReadGroup() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestPrintReadsWalker -R " + b36KGReference + " -I " + privateTestDir + "undefinedReadGroup.bam -o /dev/null", + 0, + UserException.ReadHasUndefinedReadGroup.class); + executeTest("test Undefined Read Group", spec); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilterUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilterUnitTest.java new file mode 100644 index 000000000..405610011 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilterUnitTest.java @@ -0,0 +1,246 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + + +import htsjdk.samtools.Cigar; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.TextCigarCodec; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.exceptions.UserException.UnsupportedCigarOperatorException; + +import java.lang.annotation.*; +import java.lang.reflect.Method; +import java.util.*; + + +/** + * Tests for the MalformedReadFilter + * + * @author Eric Banks + * @since 3/14/13 + */ +public class MalformedReadFilterUnitTest extends ReadFilterTest { + + ////////////////////////////////////// + // Test the checkSeqStored() method // + ////////////////////////////////////// + + @Test(enabled = true) + public void testCheckSeqStored () { + + final GATKSAMRecord goodRead = ArtificialSAMUtils.createArtificialRead(new byte[]{(byte)'A'}, new byte[]{(byte)'A'}, "1M"); + final GATKSAMRecord badRead = ArtificialSAMUtils.createArtificialRead(new byte[]{}, new byte[]{}, "1M"); + badRead.setReadString("*"); + + Assert.assertTrue(MalformedReadFilter.checkSeqStored(goodRead, true)); + Assert.assertFalse(MalformedReadFilter.checkSeqStored(badRead, true)); + + try { + MalformedReadFilter.checkSeqStored(badRead, false); + Assert.assertTrue(false, "We should have exceptioned out in the previous line"); + } catch (UserException e) { } + } + + @Test(enabled = true, dataProvider= "UnsupportedCigarOperatorDataProvider") + @CigarOperatorTest(CigarOperatorTest.Outcome.FILTER) + public void testCigarNOperatorFilterTruePositive(String cigarString) { + + final MalformedReadFilter filter = buildMalformedReadFilter(true); + final SAMRecord nContainingCigarRead = buildSAMRecord(cigarString); + Assert.assertTrue(filter.filterOut(nContainingCigarRead), + " Did not filtered out a N containing CIGAR read"); + } + + @Test(enabled = true, dataProvider= "UnsupportedCigarOperatorDataProvider") + @CigarOperatorTest(CigarOperatorTest.Outcome.ACCEPT) + public void testCigarNOperatorFilterTrueNegative(String cigarString) { + + final MalformedReadFilter filter = buildMalformedReadFilter(true); + final SAMRecord nonNContainingCigarRead = buildSAMRecord(cigarString); + Assert.assertFalse(filter.filterOut(nonNContainingCigarRead), + " Filtered out a non-N containing CIGAR read"); + } + + @Test(enabled = true, + expectedExceptions = UnsupportedCigarOperatorException.class, + dataProvider= "UnsupportedCigarOperatorDataProvider") + @CigarOperatorTest(CigarOperatorTest.Outcome.EXCEPTION) + public void testCigarNOperatorFilterException(final String cigarString) { + + final MalformedReadFilter filter = buildMalformedReadFilter(false); + final SAMRecord nContainingCigarRead = buildSAMRecord(cigarString); + + filter.filterOut(nContainingCigarRead); + } + + @Test(enabled = true, dataProvider="UnsupportedCigarOperatorDataProvider") + @CigarOperatorTest(CigarOperatorTest.Outcome.ACCEPT) + public void testCigarNOperatorFilterControl(final String cigarString) { + + final MalformedReadFilter filter = buildMalformedReadFilter(false); + final SAMRecord nonNContainingCigarRead = buildSAMRecord(cigarString); + + Assert.assertFalse(filter.filterOut(nonNContainingCigarRead)); + } + + protected SAMRecord buildSAMRecord(final String cigarString) { + final Cigar nContainingCigar = TextCigarCodec.decode(cigarString); + return this.createRead(nContainingCigar, 1, 0, 10); + } + + protected MalformedReadFilter buildMalformedReadFilter(final boolean filterRNO) { + return buildMalformedReadFiter(filterRNO,new ValidationExclusion.TYPE[] {}); + } + + protected MalformedReadFilter buildMalformedReadFiter(boolean filterRNO, final ValidationExclusion.TYPE... excl) { + final ValidationExclusion ve = new ValidationExclusion(Arrays.asList(excl)); + + final MalformedReadFilter filter = new MalformedReadFilter(); + + final SAMFileHeader h = getHeader(); + final SAMDataSource ds = getDataSource(); + + final GenomeAnalysisEngine gae = new GenomeAnalysisEngine() { + @Override + public SAMFileHeader getSAMFileHeader() { + return h; + } + + @Override + public SAMDataSource getReadsDataSource() { + return ds; + } + }; + filter.initialize(gae); + filter.filterReadsWithNCigar = filterRNO; + return filter; + } + + @Retention(RetentionPolicy.RUNTIME) + @Target(ElementType.METHOD) + @Inherited + protected @interface CigarOperatorTest { + + enum Outcome { + ANY,ACCEPT,FILTER,EXCEPTION,IGNORE; + + public boolean appliesTo (String cigar) { + boolean hasN = cigar.indexOf('N') != -1; + switch (this) { + case ANY: return true; + case ACCEPT: return !hasN; + case IGNORE: return hasN; + case FILTER: + case EXCEPTION: + default: + return hasN; + + } + } + } + + Outcome value() default Outcome.ANY; + } + + /** + * Cigar test data for unsupported operator test. + * Each element of this array corresponds to a test case. In turn the first element of the test case array is the + * Cigar string for that test case and the second indicates whether it should be filtered due to the presence of a + * unsupported operator + */ + private static final String[] TEST_CIGARS = { + "101M10D20I10M", + "6M14N5M", + "1N", + "101M", + "110N", + "2N4M", + "4M2N", + "3M1I1M", + "1M2I2M", + "1M10N1I1M", + "1M1I1D", + "11N12M1I34M12N" + }; + + @DataProvider(name= "UnsupportedCigarOperatorDataProvider") + public Iterator unsupportedOperatorDataProvider(final Method testMethod) { + final CigarOperatorTest a = resolveCigarOperatorTestAnnotation(testMethod); + final List result = new LinkedList(); + for (final String cigarString : TEST_CIGARS) { + if (a == null || a.value().appliesTo(cigarString)) { + result.add(new Object[] { cigarString }); + } + } + return result.iterator(); + } + + /** + * Gets the most specific {@link CigarOperatorTest} annotation for the + * signature of the test method provided. + *

    + * This in-house implementation is required due to the fact that method + * annotations do not have inheritance. + * + * @param m targeted test method. + * @return null if there is no {@link CigarOperatorTest} + * annotation in this or overridden methods. + */ + private CigarOperatorTest resolveCigarOperatorTestAnnotation(final Method m) { + CigarOperatorTest res = m.getAnnotation(CigarOperatorTest.class); + if (res != null) { + return res; + } + Class c = this.getClass(); + Class p = c.getSuperclass(); + while (p != null && p != Object.class) { + try { + final Method met = p.getDeclaredMethod(m.getName(), + m.getParameterTypes()); + res = met.getAnnotation(CigarOperatorTest.class); + if (res != null) { + break; + } + } catch (NoSuchMethodException e) { + // Its ok; nothing to do here, just keep looking. + } + c = p; + p = c.getSuperclass(); + } + return res; + } + +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformerUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformerUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformerUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/ReadFilterTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/ReadFilterTest.java new file mode 100644 index 000000000..d997f3758 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/ReadFilterTest.java @@ -0,0 +1,373 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; + +import java.io.File; +import java.util.*; + +/** + * Class ReadBaseTest + *

    + * This is the base test class for read filter test classes. All read + * filter test cases should extend from this + * class; it sets ups a header mock up to test read filtering. + * + * Feel free to override non-final method to modify the behavior + * (i.e. change how read group id are formatted, or complete a header). + * + *

    + * You can statically determine the number of read-group involved + * in the test by calling {@link #ReadFilterTest(int)} in you constructor. + *

    + * + * Notice that the same header object is shared by all test and + * it is initialized by Junit (calling {@link #beforeClass()}. + * + * @author Valentin Ruano Rubio + * @date May 23, 2013 + */ +public class ReadFilterTest extends BaseTest { + + private static final int DEFAULT_READ_GROUP_COUNT = 5; + private static final int DEFAULT_READER_COUNT = 1; + private static final String DEFAULT_READ_GROUP_PREFIX = "ReadGroup"; + private static final String DEFAULT_PLATFORM_UNIT_PREFIX = "Lane"; + private static final String DEFAULT_SAMPLE_NAME_PREFIX = "Sample"; + private static final String DEFAULT_PLATFORM_PREFIX = "Platform"; + private static final int DEFAULT_CHROMOSOME_COUNT = 1; + private static final int DEFAULT_CHROMOSOME_START_INDEX = 1; + private static final int DEFAULT_CHROMOSOME_SIZE = 1000; + private static final String DEFAULT_SAM_FILE_FORMAT = "readfile-%3d.bam"; + + private final int groupCount; + + private SAMFileHeader header; + + private SAMDataSource dataSource; + + /** + * Constructs a new read-filter test providing the number of read + * groups in the file. + * + * @param groupCount number of read-group in the fictional SAM file, + * must be equal or greater than 1. + */ + protected ReadFilterTest(final int groupCount) { + if (groupCount < 1) { + throw new IllegalArgumentException( + "the read group count must at least be 1"); + } + this.groupCount = groupCount; + } + + + /** + * Gets the data source. + * + * @throws IllegalStateException if the data source was not initialized + * invoking {@link #beforeClass()} + * @return never null + */ + protected final SAMDataSource getDataSource() { + checkDataSourceExists(); + return dataSource; + } + + /** + * Returns the mock-up SAM file header for testing. + * + * @throws IllegalStateException if the header was not initialized + * invoking {@link #beforeClass()} + * @return never null + */ + protected final SAMFileHeader getHeader() { + checkHeaderExists(); + return header; + } + + /** + * Construct a read filter test with the default number of groups + * ({@link #DEFAULT_READ_GROUP_COUNT}. + */ + public ReadFilterTest() { + this(DEFAULT_READ_GROUP_COUNT); + } + + /** + * Return the number of read groups involved in the test + * @return 1 or greater. + */ + protected final int getReadGroupCount() { + return groupCount; + } + + /** + * Composes the Id for the read group given its index. + * + * This methods must return a unique distinct ID for each possible index and + * it must be the same value each time it is invoked. + * + * @param index the index of the targeted read group in the range + * [1,{@link #getReadGroupCount()}] + * @return never null and must be unique to each possible + * read group index. + */ + protected String composeReadGroupId(final int index) { + checkReadGroupIndex(index); + return DEFAULT_READ_GROUP_PREFIX + index; + } + + /** + * Composes the Platform name for the read group given its index. + * + * This method must always return the same value give an index. + * + * @param index the index of the targeted read group in the range + * [1,{@link #getReadGroupCount()}] + * @return never null. + */ + protected String composePlatformName(final int index) { + checkReadGroupIndex(index); + return DEFAULT_PLATFORM_PREFIX + (((index-1)%2)+1); + } + + + /** + * Composes the Platform unit name for the read group given its index. + * + * @param index the index of the targeted read group in the range + * [1,{@link #getReadGroupCount()}] + * @return never null. + */ + protected String composePlatformUnitName(final int index) { + checkReadGroupIndex(index); + return DEFAULT_PLATFORM_UNIT_PREFIX + (((index-1)%3)+1); + } + + + + /** + * Checks the correctness of a given read group index. + * + * A correct index is any value in the range [1,{@link #getReadGroupCount()}]. + * + * @param index the target index. + * @throws IllegalArgumentException if the input index is not correct. + */ + protected final void checkReadGroupIndex(final int index) { + checkIndex(index,groupCount,"read group"); + } + + + private void checkIndex(final int index, final int max, CharSequence name) { + if (index < 1 || index > max) { + throw new IllegalArgumentException( + name + " index (" + + index + + ") is out of bounds [1," + max + "]"); + } + } + + + /** + * Checks whether the header was initialized. + * + * @throws IllegalStateException if the header was not yet initialized. + */ + protected final void checkHeaderExists() { + if (header == null) { + throw new IllegalArgumentException( + "header has not been initialized;" + + " beforeClass() was not invoked"); + } + } + + /** + * Checks whether the data source was initialized. + * + * @throws IllegalStateException if the data source was not yet initialized. + */ + protected final void checkDataSourceExists() { + if (header == null) { + throw new IllegalArgumentException( + "data source has not been initialized;" + + " beforeClass() was not invoked"); + } + } + + /** + * Returns the ID for a read group given its index. + * + * @param index the index of the targeted read group in the range + * [1,{@link #getReadGroupCount()}] + * @return never null and must be unique to each + * possible read group index. + */ + protected final String getReadGroupId(final int index) { + checkReadGroupIndex(index); + return getHeader().getReadGroups().get(index - 1).getReadGroupId(); + } + + /** + * Returns the platform name for a read group given its index. + * + * @param group the index of the targeted read group in the range + * [1,{@link #getReadGroupCount()}] + * @return never null. + */ + protected final String getPlatformName(final int group) { + checkReadGroupIndex(group); + return getHeader().getReadGroups().get(group - 1).getPlatform(); + } + + /** + * Returns the platform unit for a read group given its index. + * + * @param group the index of the targeted read group in the range + * [1,{@link #getReadGroupCount()}] + * @return never null. + */ + protected final String getPlatformUnit(final int group) { + checkReadGroupIndex(group); + return getHeader().getReadGroups().get(group - 1).getPlatformUnit(); + } + + + /** + * Composes the mock up SAM file header. + * + * It must return an equivalent (equal) value each time it is invoked. + * + * @return never null. + */ + protected SAMFileHeader composeHeader() { + + return ArtificialSAMUtils.createArtificialSamHeader( + DEFAULT_CHROMOSOME_COUNT, DEFAULT_CHROMOSOME_START_INDEX, + DEFAULT_CHROMOSOME_SIZE); + } + + @BeforeClass + public void beforeClass() { + + header = composeHeader(); + dataSource = composeDataSource(); + final List readGroupIDs = new ArrayList(); + final List sampleNames = new ArrayList(); + + for (int i = 1; i <= getReadGroupCount(); i++) { + final String readGroupId = composeReadGroupId(i); + readGroupIDs.add(readGroupId); + sampleNames.add(readGroupId); + } + + ArtificialSAMUtils.createEnumeratedReadGroups( + header, readGroupIDs, sampleNames); + + for (int i = 1; i <= getReadGroupCount(); i++) { + final String readGroupId = readGroupIDs.get(i-1); + final SAMReadGroupRecord groupRecord = header.getReadGroup(readGroupId); + groupRecord.setAttribute("PL", composePlatformName(i)); + groupRecord.setAttribute("PU", composePlatformUnitName(i)); + } + + } + + protected ValidationExclusion composeValidationExclusion() { + return new ValidationExclusion(); + } + + protected SAMDataSource composeDataSource() { + checkHeaderExists(); + final File referenceFile = null; // Not used in this test. + final Set readerIDs = new HashSet<>(1); + final ThreadAllocation ta = new ThreadAllocation(); + final Integer numFileHandles = 1; // I believe that any value would do but need to confirm. + final boolean useOriginalBaseQualities = true; + final ValidationStringency strictness = ValidationStringency.LENIENT; + final Integer readBufferSize = 1; // not relevant. + final DownsamplingMethod downsamplingMethod = DownsamplingMethod.NONE; + final ValidationExclusion exclusionList = composeValidationExclusion(); + final Collection supplementalFilters = Collections.EMPTY_SET; + final boolean includeReadsWithDeletionAtLoci = true; + + final GenomeLocParser glp = new GenomeLocParser(header.getSequenceDictionary()); + final SAMDataSource res = new SAMDataSource( + referenceFile, + readerIDs, + ta, + numFileHandles, + glp, + useOriginalBaseQualities, + strictness, + readBufferSize, + downsamplingMethod, + exclusionList, + supplementalFilters, + includeReadsWithDeletionAtLoci); + + return res; + } + + @AfterClass + public void afterClass() { + header = null; + dataSource = null; + } + + /** + * Creates a read record. + * + * @param cigar the new record CIGAR. + * @param group the new record group index that must be in the range \ + * [1,{@link #getReadGroupCount()}] + * @param reference the reference sequence index (0-based) + * @param start the start position of the read alignment in the reference + * (1-based) + * @return never null + */ + protected SAMRecord createRead(final Cigar cigar, final int group, final int reference, final int start) { + final SAMRecord record = ArtificialSAMUtils.createArtificialRead(cigar); + record.setHeader(getHeader()); + record.setAlignmentStart(start); + record.setReferenceIndex(reference); + record.setAttribute(SAMTag.RG.toString(), getReadGroupId(group)); + return record; + + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilterUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilterUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilterUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilterUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/UnsafeMalformedReadFilterUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/UnsafeMalformedReadFilterUnitTest.java new file mode 100644 index 000000000..343ad656e --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/UnsafeMalformedReadFilterUnitTest.java @@ -0,0 +1,50 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + + +import org.broadinstitute.gatk.utils.ValidationExclusion; + +import java.util.Collections; + + +/** + * Tests for the {@link MalformedReadFilter} when the unsafe flag + * {@link ValidationExclusion.TYPE#ALL} is set. + * + * @author Valentin Ruano-Rubio + * @since 6/6/13 + */ +public class UnsafeMalformedReadFilterUnitTest extends AllowNCigarMalformedReadFilterUnitTest { + + + @Override + protected ValidationExclusion composeValidationExclusion() { + return new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)); + } + + +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/io/OutputTrackerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/io/OutputTrackerUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/io/OutputTrackerUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/io/OutputTrackerUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/io/stubs/ArgumentTypeDescriptorUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/io/stubs/ArgumentTypeDescriptorUnitTest.java new file mode 100644 index 000000000..60e529281 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/io/stubs/ArgumentTypeDescriptorUnitTest.java @@ -0,0 +1,233 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.io.stubs; + +import htsjdk.variant.variantcontext.VariantContext; +import it.unimi.dsi.fastutil.objects.ObjectArrayList; +import htsjdk.samtools.SAMFileWriter; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; +import java.util.Arrays; +import java.util.Collection; + + +public class ArgumentTypeDescriptorUnitTest extends BaseTest { + + //////////////////////////////////////////////////////////////////// + // This section tests the functionality of the @Output annotation // + //////////////////////////////////////////////////////////////////// + + private class ATDTestCommandLineProgram extends CommandLineProgram { + public int execute() { return 0; } + + @Override + public Collection getArgumentTypeDescriptors() { + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + return Arrays.asList( new SAMFileWriterArgumentTypeDescriptor(engine, System.out), + new OutputStreamArgumentTypeDescriptor(engine, System.out), + new VCFWriterArgumentTypeDescriptor(engine, System.out, null)); + } + + protected abstract class ATDTestOutputArgumentSource { + public abstract Object getOut(); + } + + protected class OutputRequiredSamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = true) + public SAMFileWriter out; + public Object getOut() { return out; } + } + + protected class OutputRequiredVcfArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = true) + public VariantContextWriter out; + public Object getOut() { return out; } + } + + protected class OutputRequiredStreamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = true) + public PrintStream out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredNoDefaultSamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false, defaultToStdout = false) + public SAMFileWriter out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredNoDefaultVcfArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false, defaultToStdout = false) + public VariantContextWriter out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredNoDefaultStreamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false, defaultToStdout = false) + public PrintStream out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredSamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false) + public SAMFileWriter out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredVcfArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false) + public VariantContextWriter out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredStreamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false) + public PrintStream out; + public Object getOut() { return out; } + } + } + + @DataProvider(name = "OutputProvider") + public Object[][] OutputProvider() { + + ObjectArrayList tests = new ObjectArrayList(); + + final ATDTestCommandLineProgram clp = new ATDTestCommandLineProgram(); + + for ( final Object obj : Arrays.asList(clp.new OutputRequiredSamArgumentSource(), clp.new OutputRequiredVcfArgumentSource(), clp.new OutputRequiredStreamArgumentSource()) ) { + for ( final boolean provided : Arrays.asList(true, false) ) { + tests.add(new Object[]{obj, true, true, provided}); + } + } + + for ( final Object obj : Arrays.asList(clp.new OutputNotRequiredSamArgumentSource(), clp.new OutputNotRequiredVcfArgumentSource(), clp.new OutputNotRequiredStreamArgumentSource()) ) { + for ( final boolean provided : Arrays.asList(true, false) ) { + tests.add(new Object[]{obj, false, true, provided}); + } + } + + for ( final Object obj : Arrays.asList(clp.new OutputNotRequiredNoDefaultSamArgumentSource(), clp.new OutputNotRequiredNoDefaultVcfArgumentSource(), clp.new OutputNotRequiredNoDefaultStreamArgumentSource()) ) { + for ( final boolean provided : Arrays.asList(true, false) ) { + tests.add(new Object[]{obj, false, false, provided}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "OutputProvider") + public void testOutput(final ATDTestCommandLineProgram.ATDTestOutputArgumentSource argumentSource, final boolean required, final boolean hasDefault, final boolean provided) { + + final ParsingEngine parser = new ParsingEngine(new ATDTestCommandLineProgram()); + parser.addArgumentSource(argumentSource.getClass()); + parser.parse(provided ? new String[] {"out", "foo"} : new String[] {}); + + try { + parser.loadArgumentsIntoObject(argumentSource); + + if ( !provided && (required || !hasDefault) ) + Assert.assertEquals(argumentSource.getOut(), null); + else if ( !provided ) + Assert.assertNotEquals(argumentSource.getOut(), null); + else if ( argumentSource.getOut() == null || !(argumentSource.getOut() instanceof SAMFileWriterStub) ) // can't test this one case + Assert.assertEquals(!provided, outputIsStdout(argumentSource.getOut())); + + } catch (Exception e) { + throw new ReviewedGATKException(e.getMessage()); + } + } + + @Test + public void testRodBindingsCollection() { + + final ParsingEngine parser = new ParsingEngine(new ATDTestCommandLineProgram()); + + //A list file containing a single VCF + final File listFile = createTempListFile("oneVCF", privateTestDir + "empty.vcf"); + + try { + Object result = ArgumentTypeDescriptor.getRodBindingsCollection(listFile, + parser, + VariantContext.class, + "variant", + new Tags(), + "variantTest"); + if (!(result instanceof RodBindingCollection)) + throw new ReviewedGATKException("getRodBindingsCollection did not return a RodBindingCollection"); + RodBindingCollection rbc = (RodBindingCollection) result; + + Assert.assertEquals(rbc.getType(), VariantContext.class); + Assert.assertEquals(rbc.getRodBindings().size(), 1); + + } catch (IOException e) { + throw new ReviewedGATKException(e.getMessage(), e); + } + + //The same file, now with an extra blank line + final File listFileWithBlank = createTempListFile("oneVCFwithBlankLine", privateTestDir + "empty.vcf", ""); + try { + Object result = ArgumentTypeDescriptor.getRodBindingsCollection(listFileWithBlank, + parser, + VariantContext.class, + "variant", + new Tags(), + "variantTest"); + if (!(result instanceof RodBindingCollection)) + throw new ReviewedGATKException("getRodBindingsCollection did not return a RodBindingCollection"); + RodBindingCollection rbc = (RodBindingCollection) result; + + Assert.assertEquals(rbc.getType(), VariantContext.class); + Assert.assertEquals(rbc.getRodBindings().size(), 1); + + } catch (IOException e) { + throw new ReviewedGATKException(e.getMessage(), e); + } + } + + private static boolean outputIsStdout(final Object out) { + if ( out == null ) { + return false; + } else if ( out instanceof SAMFileWriterStub ) { + return ((SAMFileWriterStub)out).getOutputStream() != System.out; + } else if ( out instanceof VariantContextWriterStub ) { + return ((VariantContextWriterStub)out).getOutputStream() == System.out; + } else if ( out instanceof OutputStreamStub ) { + return ((OutputStreamStub)out).getOutputStream() == System.out; + } + return false; + } + +} \ No newline at end of file diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIteratorUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIteratorUnitTest.java new file mode 100644 index 000000000..b295e1230 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIteratorUnitTest.java @@ -0,0 +1,144 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + +import static org.testng.Assert.fail; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.testng.Assert; + +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; + +import org.testng.annotations.BeforeMethod; + +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + + + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @author aaron + * @version 1.0 + * @date Apr 14, 2009 + *

    + * Class BoundedReadIteratorUnitTest + *

    + * tests for the bounded read iterator. + */ +public class BoundedReadIteratorUnitTest extends BaseTest { + + /** the file list and the fasta sequence */ + private List fl; + private ReferenceSequenceFile seq; + + /** + * This function does the setup of our parser, before each method call. + *

    + * Called before every test case method. + */ + @BeforeMethod + public void doForEachTest() throws FileNotFoundException { + fl = new ArrayList(); + } + + + /** Test out that we can shard the file and iterate over every read */ + @Test + public void testBounding() { + logger.warn("Executing testBounding"); + // total reads expected + final int expected = 20; + // bound by ten reads + BoundedReadIterator iter = new BoundedReadIterator(new testIterator(), expected); + + int count = 0; + for (SAMRecord rec: iter) { + count++; + } + + Assert.assertEquals(count, expected); + } +} + +class testIterator implements GATKSAMIterator { + SAMFileHeader header; + testIterator() { + header = ArtificialSAMUtils.createArtificialSamHeader(1,1,2000); + } + + public void close() { + + } + + public boolean hasNext() { + return true; + } + + public SAMRecord next() { + return ArtificialSAMUtils.createArtificialRead(header,"blah",0,1,100); + } + + public void remove() { + } + + public Iterator iterator() { + return this; + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIteratorAdapterUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIteratorAdapterUnitTest.java new file mode 100644 index 000000000..fc7465de3 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIteratorAdapterUnitTest.java @@ -0,0 +1,179 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.util.CloseableIterator; +import org.broadinstitute.gatk.utils.BaseTest; +import static org.testng.Assert.assertEquals; + +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIteratorAdapter; +import org.testng.annotations.Test; + +import java.util.Iterator; + +/** + * + * User: aaron + * Date: May 13, 2009 + * Time: 6:58:21 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date May 13, 2009 + *

    + * Class GATKSAMIteratorTest + *

    + * Tests the GATKSAMIteratorAdapter class. + */ +public class GATKSAMIteratorAdapterUnitTest extends BaseTest { + + class MyTestIterator implements Iterator { + + public int count = 0; + + public MyTestIterator() { + count = 0; + } + + public boolean hasNext() { + if (count < 100) { + ++count; + return true; + } else { + return false; + } + } + + public SAMRecord next() { + return null; + } + + public void remove() { + throw new UnsupportedOperationException("Unsupported"); + } + } + + class MyTestCloseableIterator implements CloseableIterator { + public int count = 0; + + public MyTestCloseableIterator() { + count = 0; + } + + public boolean hasNext() { + if (count < 100) { + ++count; + return true; + } else { + return false; + } + } + + public SAMRecord next() { + return null; + } + + public void remove() { + throw new UnsupportedOperationException("Unsupported"); + } + + public void close() { + count = -1; + } + } + + + @Test + public void testNormalIterator() { + final int COUNT = 100; + MyTestIterator it = new MyTestIterator(); + + GATKSAMIterator samIt = GATKSAMIteratorAdapter.adapt(it); + int countCheck = 0; + while (samIt.hasNext()) { + samIt.next(); + ++countCheck; + //logger.warn("cnt = " + countCheck); + } + + assertEquals(countCheck, COUNT); + + assertEquals(countCheck, COUNT); + } + + @Test + public void testCloseableIterator() { + final int COUNT = 100; + + MyTestCloseableIterator it = new MyTestCloseableIterator(); + + GATKSAMIterator samIt = GATKSAMIteratorAdapter.adapt(it); + + int countCheck = 0; + while (samIt.hasNext()) { + samIt.next(); + ++countCheck; + } + + assertEquals(countCheck, COUNT); + } + + @Test + public void testCloseOnCloseableIterator() { + final int COUNT = 100; + + MyTestCloseableIterator it = new MyTestCloseableIterator(); + + GATKSAMIterator samIt = GATKSAMIteratorAdapter.adapt(it); + + + int countCheck = 0; + while (samIt.hasNext()) { + samIt.next(); + ++countCheck; + } + + assertEquals(countCheck, COUNT); + + // check to see that the count get's set to -1 + samIt.close(); + assertEquals(it.count, -1); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/MisencodedBaseQualityUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/MisencodedBaseQualityUnitTest.java new file mode 100644 index 000000000..994de2b28 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/MisencodedBaseQualityUnitTest.java @@ -0,0 +1,99 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + + +import htsjdk.samtools.SAMFileHeader; +import org.broadinstitute.gatk.engine.iterators.MisencodedBaseQualityReadTransformer; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Basic unit test for misencoded quals + */ +public class MisencodedBaseQualityUnitTest extends BaseTest { + + private static final String readBases = "AAAAAAAAAA"; + private static final byte[] badQuals = { 59, 60, 62, 63, 64, 61, 62, 58, 57, 56 }; + private static final byte[] goodQuals = { 60, 60, 60, 60, 60, 60, 60, 60, 60, 60 }; + private static final byte[] fixedQuals = { 28, 29, 31, 32, 33, 30, 31, 27, 26, 25 }; + private SAMFileHeader header; + + @BeforeMethod + public void before() { + // reset the read counter so that we are deterministic + MisencodedBaseQualityReadTransformer.currentReadCounter = 0; + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + } + + private GATKSAMRecord createRead(final boolean useGoodBases) { + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, readBases.getBytes(), + useGoodBases ? Arrays.copyOf(goodQuals, goodQuals.length) : + Arrays.copyOf(badQuals, badQuals.length)); + read.setCigarString("10M"); + return read; + } + + @Test(enabled = true) + public void testGoodQuals() { + final List reads = new ArrayList(10000); + for ( int i = 0; i < 10000; i++ ) + reads.add(createRead(true)); + + testEncoding(reads); + } + + @Test(enabled = true, expectedExceptions = {UserException.class}) + public void testBadQualsThrowsError() { + final List reads = new ArrayList(10000); + for ( int i = 0; i < 10000; i++ ) + reads.add(createRead(false)); + + testEncoding(reads); + } + + @Test(enabled = true) + public void testFixBadQuals() { + final GATKSAMRecord read = createRead(false); + final GATKSAMRecord fixedRead = MisencodedBaseQualityReadTransformer.fixMisencodedQuals(read); + for ( int i = 0; i < fixedQuals.length; i++ ) + Assert.assertEquals(fixedQuals[i], fixedRead.getBaseQualities()[i]); + } + + private void testEncoding(final List reads) { + for ( final GATKSAMRecord read : reads ) + MisencodedBaseQualityReadTransformer.checkForMisencodedQuals(read); + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIteratorUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIteratorUnitTest.java new file mode 100644 index 000000000..c12bb1551 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIteratorUnitTest.java @@ -0,0 +1,52 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIteratorAdapter; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.Arrays; + + +public class ReadFormattingIteratorUnitTest extends BaseTest { + + @Test + public void testIteratorConsolidatesCigars() { + final Cigar unconsolidatedCigar = TextCigarCodec.decode("3M0M5M0M"); + final SAMRecord unconsolidatedRead = ArtificialSAMUtils.createArtificialRead(unconsolidatedCigar); + + final GATKSAMIterator readIterator = GATKSAMIteratorAdapter.adapt(Arrays.asList(unconsolidatedRead).iterator()); + final ReadFormattingIterator formattingIterator = new ReadFormattingIterator(readIterator, false, (byte)-1); + final SAMRecord postIterationRead = formattingIterator.next(); + + Assert.assertEquals(postIterationRead.getCigarString(), "8M", "Cigar 3M0M5M0M not consolidated correctly by ReadFormattingIterator"); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/VerifyingSamIteratorUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/VerifyingSamIteratorUnitTest.java new file mode 100644 index 000000000..c7e7d05d5 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/VerifyingSamIteratorUnitTest.java @@ -0,0 +1,129 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIteratorAdapter; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Mar 2, 2011 + * Time: 9:48:10 PM + * To change this template use File | Settings | File Templates. + */ +public class VerifyingSamIteratorUnitTest { + private SAMFileHeader samFileHeader; + + @BeforeClass + public void init() { + SAMSequenceDictionary sequenceDictionary = new SAMSequenceDictionary(); + sequenceDictionary.addSequence(new SAMSequenceRecord("1",500)); + sequenceDictionary.addSequence(new SAMSequenceRecord("2",500)); + + samFileHeader = new SAMFileHeader(); + samFileHeader.setSequenceDictionary(sequenceDictionary); + } + + @Test + public void testSortedReadsBasic() { + SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read1",getContig(0).getSequenceIndex(),1,10); + SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),2,10); + List reads = Arrays.asList(read1,read2); + + VerifyingSamIterator iterator = new VerifyingSamIterator(GATKSAMIteratorAdapter.adapt(reads.iterator())); + + Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); + Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); + Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); + Assert.assertSame(iterator.next(),read2,"Incorrect read in read 2 position"); + Assert.assertFalse(iterator.hasNext(),"Too many reads in iterator"); + } + + @Test + public void testSortedReadsAcrossContigs() { + SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read1",getContig(0).getSequenceIndex(),2,10); + SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(1).getSequenceIndex(),1,10); + List reads = Arrays.asList(read1,read2); + + VerifyingSamIterator iterator = new VerifyingSamIterator(GATKSAMIteratorAdapter.adapt(reads.iterator())); + + Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); + Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); + Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); + Assert.assertSame(iterator.next(),read2,"Incorrect read in read 2 position"); + Assert.assertFalse(iterator.hasNext(),"Too many reads in iterator"); + } + + @Test(expectedExceptions=UserException.MissortedBAM.class) + public void testImproperlySortedReads() { + SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read1",getContig(0).getSequenceIndex(),2,10); + SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),1,10); + List reads = Arrays.asList(read1,read2); + + VerifyingSamIterator iterator = new VerifyingSamIterator(GATKSAMIteratorAdapter.adapt(reads.iterator())); + + Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); + Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); + Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); + + // Should trigger MissortedBAM exception. + iterator.next(); + } + + @Test(expectedExceptions=UserException.MalformedBAM.class) + public void testInvalidAlignment() { + // Create an invalid alignment state. + SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read1",getContig(0).getSequenceIndex(),1,10); + SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read1",getContig(0).getSequenceIndex(),2,10); + read1.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); + List reads = Arrays.asList(read1,read2); + + VerifyingSamIterator iterator = new VerifyingSamIterator(GATKSAMIteratorAdapter.adapt(reads.iterator())); + + Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); + Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); + Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); + + // Should trigger MalformedBAM exception. + iterator.next(); + } + + private SAMSequenceRecord getContig(final int contigIndex) { + return samFileHeader.getSequence(contigIndex); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReportUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReportUnitTest.java new file mode 100644 index 000000000..3042e3082 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReportUnitTest.java @@ -0,0 +1,358 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.phonehome; + +import org.broadinstitute.gatk.engine.walkers.*; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; +import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.jets3t.service.S3Service; +import org.jets3t.service.S3ServiceException; +import org.jets3t.service.ServiceException; +import org.jets3t.service.model.S3Object; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.FileInputStream; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +public class GATKRunReportUnitTest extends BaseTest { + private final static boolean DEBUG = false; + private static final long S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING = 30 * 1000; + private static final String AWS_DOWNLOADER_CREDENTIALS_PROPERTIES_FILE = privateTestDir + "phonehome/awsDownloaderCredentials.properties"; + + private Walker walker; + private Exception exception; + private GenomeAnalysisEngine engine; + private String downloaderAccessKey; + private String downloaderSecretKey; + + @BeforeClass + public void setup() throws Exception { + walker = new RunReportDummyReadWalker(); + exception = new IllegalArgumentException("javaException"); + engine = new GenomeAnalysisEngine(); + engine.setArguments(new GATKArgumentCollection()); + + Properties awsProperties = new Properties(); + awsProperties.load(new FileInputStream(AWS_DOWNLOADER_CREDENTIALS_PROPERTIES_FILE)); + downloaderAccessKey = awsProperties.getProperty("accessKey"); + downloaderSecretKey = awsProperties.getProperty("secretKey"); + } + + @Test(enabled = ! DEBUG) + public void testAWSKeysAreValid() { + // throws an exception if they aren't + GATKRunReport.checkAWSAreValid(); + } + + @Test(enabled = ! DEBUG) + public void testAccessKey() throws Exception { + testAWSKey(GATKRunReport.getAWSUploadAccessKey(), GATKRunReport.AWS_ACCESS_KEY_MD5); + } + + @Test(enabled = ! DEBUG) + public void testSecretKey() throws Exception { + testAWSKey(GATKRunReport.getAWSUploadSecretKey(), GATKRunReport.AWS_SECRET_KEY_MD5); + } + + private void testAWSKey(final String accessKey, final String expectedMD5) throws Exception { + Assert.assertNotNull(accessKey, "AccessKey should not be null"); + final String actualmd5 = Utils.calcMD5(accessKey); + Assert.assertEquals(actualmd5, expectedMD5); + } + + @DataProvider(name = "GATKReportCreationTest") + public Object[][] makeGATKReportCreationTest() { + List tests = new ArrayList(); + + final Walker readWalker = new RunReportDummyReadWalker(); + final Walker lociWalker = new RunReportDummyLocusWalker(); + final Walker rodWalker = new RunReportDummyRodWalker(); + final Walker artWalker = new RunReportDummyActiveRegionWalker(); + + final Exception noException = null; + final Exception javaException = new IllegalArgumentException("javaException"); + final Exception stingException = new ReviewedGATKException("GATKException"); + final Exception userException = new UserException("userException"); + + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + engine.setArguments(new GATKArgumentCollection()); + + for ( final Walker walker : Arrays.asList(readWalker, lociWalker, rodWalker, artWalker) ) { + for ( final Exception exception : Arrays.asList(noException, javaException, stingException, userException) ) { + tests.add(new Object[]{walker, exception, engine}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "GATKReportCreationTest") + public void testGATKReportCreationReadingAndWriting(final Walker walker, final Exception exception, final GenomeAnalysisEngine engine) throws Exception { + final GATKRunReport report = new GATKRunReport(walker, exception, engine, GATKRunReport.PhoneHomeOption.STDOUT); + final ByteArrayOutputStream captureStream = new ByteArrayOutputStream(); + final boolean succeeded = report.postReportToStream(captureStream); + Assert.assertTrue(succeeded, "Failed to write report to stream"); + Assert.assertFalse(report.exceptionOccurredDuringPost(), "Post succeeded but report says it failed"); + Assert.assertNull(report.getErrorMessage(), "Post succeeded but there was an error message"); + Assert.assertNull(report.getErrorThrown(), "Post succeeded but there was an error message"); + final InputStream readStream = new ByteArrayInputStream(captureStream.toByteArray()); + + GATKRunReport deserialized = null; + try { + deserialized = GATKRunReport.deserializeReport(readStream); + } catch ( Exception e ) { + final String reportString = new String(captureStream.toByteArray()); + Assert.fail("Failed to deserialize GATK report " + reportString + " with exception " + e); + } + + if ( deserialized != null ) + Assert.assertEquals(report, deserialized); + } + + @DataProvider(name = "GATKAWSReportMode") + public Object[][] makeGATKAWSReportMode() { + List tests = new ArrayList(); + + for ( final GATKRunReport.AWSMode mode : GATKRunReport.AWSMode.values() ) { + tests.add(new Object[]{mode}); + } + + return tests.toArray(new Object[][]{}); + } + + // Will fail with timeout if AWS time out isn't working + // Will fail with exception if AWS doesn't protect itself from errors + @Test(enabled = ! DEBUG, dataProvider = "GATKAWSReportMode", timeOut = S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING * 2) + public void testAWS(final GATKRunReport.AWSMode awsMode) { + logger.warn("Starting testAWS mode=" + awsMode); + + // Use a shorter timeout than usual when we're testing GATKRunReport.AWSMode.TIMEOUT + final long thisTestS3Timeout = awsMode == GATKRunReport.AWSMode.TIMEOUT ? 30 * 1000 : S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING; + final GATKRunReport report = new GATKRunReport(walker, exception, engine, GATKRunReport.PhoneHomeOption.AWS, thisTestS3Timeout); + report.sendAWSToTestBucket(); + report.setAwsMode(awsMode); + final S3Object s3Object = report.postReportToAWSS3(); + + if ( awsMode == GATKRunReport.AWSMode.NORMAL ) { + Assert.assertNotNull(s3Object, "Upload to AWS failed, s3Object was null. error was " + report.formatError()); + Assert.assertFalse(report.exceptionOccurredDuringPost(), "The upload should have succeeded but the report says it didn't. Error was " + report.formatError()); + Assert.assertNull(report.getErrorMessage(), "Report succeeded but an error message was found"); + Assert.assertNull(report.getErrorThrown(), "Report succeeded but an thrown error was found"); + try { + final GATKRunReport deserialized = GATKRunReport.deserializeReport(downloaderAccessKey, downloaderSecretKey, report.getS3ReportBucket(), s3Object); + Assert.assertEquals(report, deserialized); + deleteFromS3(report); + } catch ( Exception e ) { + Assert.fail("Failed to read, deserialize, or delete GATK report " + s3Object.getName() + " with exception " + e); + } + } else { + Assert.assertNull(s3Object, "AWS upload should have failed for mode " + awsMode + " but got non-null s3 object back " + s3Object + " error was " + report.formatError()); + Assert.assertTrue(report.exceptionOccurredDuringPost(), "S3 object was null but the report says that the upload succeeded"); + Assert.assertNotNull(report.getErrorMessage(), "Report succeeded but an error message wasn't found"); + if ( awsMode == GATKRunReport.AWSMode.FAIL_WITH_EXCEPTION ) + Assert.assertNotNull(report.getErrorThrown()); + } + } + + private void deleteFromS3(final GATKRunReport report) throws Exception { + final S3Service s3Service = GATKRunReport.initializeAWSService(downloaderAccessKey, downloaderSecretKey); + // Retrieve the whole data object we created previously + s3Service.deleteObject(report.getS3ReportBucket(), report.getReportFileName()); + } + + @DataProvider(name = "PostReportByType") + public Object[][] makePostReportByType() { + List tests = new ArrayList(); + + for ( final GATKRunReport.PhoneHomeOption et : GATKRunReport.PhoneHomeOption.values() ) { + tests.add(new Object[]{et}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = ! DEBUG, dataProvider = "PostReportByType", timeOut = S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING * 2) + public void testPostReportByType(final GATKRunReport.PhoneHomeOption type) { + final GATKRunReport report = new GATKRunReport(walker, exception, engine, GATKRunReport.PhoneHomeOption.AWS, S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING); + Assert.assertFalse(report.exceptionOccurredDuringPost(), "An exception occurred during posting the report"); + final boolean succeeded = report.postReport(type); + + if ( type == GATKRunReport.PhoneHomeOption.NO_ET ) + Assert.assertFalse(succeeded, "NO_ET option shouldn't write a report"); + else { + Assert.assertTrue(succeeded, "Any non NO_ET option should succeed in writing a report"); + + if ( type == GATKRunReport.PhoneHomeOption.STDOUT ) { + // nothing to do + } else { + // must have gone to AWS + try { + Assert.assertTrue(report.wentToAWS(), "The report should have gone to AWS but the report says it wasn't"); + deleteFromS3(report); + } catch ( Exception e ) { + Assert.fail("Failed delete GATK report " + report.getReportFileName() + " with exception " + e); + } + } + } + } + + public interface S3Op { + public void apply() throws ServiceException; + } + + // Will fail with timeout if AWS time out isn't working + // Will fail with exception if AWS doesn't protect itself from errors + @Test(timeOut = S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING * 2) + public void testAWSPublicKeyHasAccessControls() throws Exception { + final GATKRunReport report = new GATKRunReport(walker, exception, engine, GATKRunReport.PhoneHomeOption.AWS, S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING); + report.sendAWSToTestBucket(); + final S3Object s3Object = report.postReportToAWSS3(); + Assert.assertNotNull(s3Object, "Upload to AWS failed, s3Object was null. error was " + report.formatError()); + + // create a service with the public key, and make sure it cannot list or delete + final S3Service s3Service = GATKRunReport.initializeAWSService(GATKRunReport.getAWSUploadAccessKey(), GATKRunReport.getAWSUploadSecretKey()); + assertOperationNotAllowed("listAllBuckets", new S3Op() { + @Override + public void apply() throws S3ServiceException { + s3Service.listAllBuckets(); + } + }); + assertOperationNotAllowed("listBucket", new S3Op() { + @Override + public void apply() throws S3ServiceException { s3Service.listObjects(report.getS3ReportBucket()); } + }); + assertOperationNotAllowed("createBucket", new S3Op() { + @Override + public void apply() throws S3ServiceException { s3Service.createBucket("ShouldNotCreate"); } + }); + assertOperationNotAllowed("deleteObject", new S3Op() { + @Override + public void apply() throws ServiceException { s3Service.deleteObject(report.getS3ReportBucket(), report.getReportFileName()); } + }); + } + + private void assertOperationNotAllowed(final String name, final S3Op op) { + try { + op.apply(); + // only gets here if the operation was successful + Assert.fail("Operation " + name + " ran successfully but we expected to it fail"); + } catch ( ServiceException e ) { + Assert.assertEquals(e.getErrorCode(), "AccessDenied"); + } + } + + class RunReportDummyReadWalker extends ReadWalker { + @Override + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } + + class RunReportDummyLocusWalker extends LocusWalker { + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } + + class RunReportDummyRodWalker extends RodWalker { + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } + + class RunReportDummyActiveRegionWalker extends ActiveRegionWalker { + @Override + public ActivityProfileState isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return new ActivityProfileState(ref.getLocus(), 0.0); + } + + @Override + public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/samples/PedReaderUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/samples/PedReaderUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/samples/PedReaderUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/samples/PedReaderUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/samples/SampleDBUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/samples/SampleDBUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/samples/SampleDBUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/samples/SampleDBUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/samples/SampleUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/samples/SampleUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/samples/SampleUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/samples/SampleUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/DummyActiveRegionWalker.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/DummyActiveRegionWalker.java new file mode 100644 index 000000000..b32a3db63 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/DummyActiveRegionWalker.java @@ -0,0 +1,116 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.traversals; + +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.walkers.ActiveRegionWalker; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; +import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; + +import java.util.*; + +/** + * ActiveRegionWalker for unit testing + * + * User: depristo + * Date: 1/15/13 + * Time: 1:28 PM + */ +class DummyActiveRegionWalker extends ActiveRegionWalker { + private final double prob; + private EnumSet states = super.desiredReadStates(); + private GenomeLocSortedSet activeRegions = null; + + protected List isActiveCalls = new ArrayList(); + protected Map mappedActiveRegions = new LinkedHashMap(); + private boolean declareHavingPresetRegions = false; + + public DummyActiveRegionWalker() { + this(1.0); + } + + public DummyActiveRegionWalker(double constProb) { + this.prob = constProb; + } + + public DummyActiveRegionWalker(GenomeLocSortedSet activeRegions, EnumSet wantStates, final boolean declareHavingPresetRegions) { + this(activeRegions, declareHavingPresetRegions); + this.states = wantStates; + } + + public DummyActiveRegionWalker(GenomeLocSortedSet activeRegions, final boolean declareHavingPresetRegions) { + this(1.0); + this.activeRegions = activeRegions; + this.declareHavingPresetRegions = declareHavingPresetRegions; + } + + public void setStates(EnumSet states) { + this.states = states; + } + + @Override + public boolean hasPresetActiveRegions() { + return declareHavingPresetRegions; + } + + @Override + public GenomeLocSortedSet getPresetActiveRegions() { + return declareHavingPresetRegions ? activeRegions : null; + } + + @Override + public EnumSet desiredReadStates() { + return states; + } + + @Override + public ActivityProfileState isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + isActiveCalls.add(ref.getLocus()); + final double p = activeRegions == null || activeRegions.overlaps(ref.getLocus()) ? prob : 0.0; + return new ActivityProfileState(ref.getLocus(), p); + } + + @Override + public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { + mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCacheUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCacheUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCacheUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCacheUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegionsUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegionsUnitTest.java new file mode 100644 index 000000000..5b710a10d --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegionsUnitTest.java @@ -0,0 +1,680 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.traversals; + +import com.google.java.contract.PreconditionError; +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.engine.datasources.reads.*; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.interval.IntervalUtils; +import org.broadinstitute.gatk.utils.sam.*; +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.datasources.providers.LocusShardDataProvider; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.engine.executive.WindowMaker; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + + +import java.io.File; +import java.io.IOException; +import java.util.*; + +/** + * Created with IntelliJ IDEA. + * User: thibault + * Date: 11/13/12 + * Time: 2:47 PM + * + * Test the Active Region Traversal Contract + * http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract + */ +public class TraverseActiveRegionsUnitTest extends BaseTest { + private final static boolean ENFORCE_CONTRACTS = false; + private final static boolean DEBUG = false; + + @DataProvider(name = "TraversalEngineProvider") + public Object[][] makeTraversals() { + final List traversals = new LinkedList(); + traversals.add(new Object[]{new TraverseActiveRegions<>()}); + return traversals.toArray(new Object[][]{}); + } + + private File referenceFile; + private IndexedFastaSequenceFile reference; + private SAMSequenceDictionary dictionary; + private GenomeLocParser genomeLocParser; + + private List intervals; + + private File testBAM; + + @BeforeClass + private void init() throws IOException { + //reference = new CachingIndexedFastaSequenceFile(new File("/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta")); // hg19Reference)); + referenceFile = new File(hg19Reference); + reference = new CachingIndexedFastaSequenceFile(referenceFile); + dictionary = reference.getSequenceDictionary(); + genomeLocParser = new GenomeLocParser(dictionary); + + // TODO: reads with indels + // TODO: reads which span many regions + // TODO: reads which are partially between intervals (in/outside extension) + // TODO: duplicate reads + // TODO: read at the end of a contig + // TODO: reads which are completely outside intervals but within extension + // TODO: test the extension itself + // TODO: unmapped reads + + intervals = new ArrayList(); + intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); + intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000)); + intervals.add(genomeLocParser.createGenomeLoc("2", 1, 100)); + intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 10100)); + intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); + + List reads = new ArrayList(); + reads.add(buildSAMRecord("simple", "1", 100, 200)); + reads.add(buildSAMRecord("overlap_equal", "1", 10, 20)); + reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21)); + reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009)); + reads.add(buildSAMRecord("boundary_unequal", "1", 1990, 2008)); + reads.add(buildSAMRecord("boundary_1_pre", "1", 1950, 2000)); + reads.add(buildSAMRecord("boundary_1_post", "1", 1999, 2050)); + reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); + reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); + reads.add(buildSAMRecord("shard_boundary_1_pre", "1", 16300, 16385)); + reads.add(buildSAMRecord("shard_boundary_1_post", "1", 16384, 16400)); + reads.add(buildSAMRecord("shard_boundary_equal", "1", 16355, 16414)); + reads.add(buildSAMRecord("simple20", "20", 10025, 10075)); + + createBAM(reads); + } + + private void createBAM(List reads) throws IOException { + testBAM = createTempFile("TraverseActiveRegionsUnitTest", ".bam"); + + SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, testBAM); + for (GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) { + out.addAlignment(read); + } + out.close(); + + new File(testBAM.getAbsolutePath().replace(".bam", ".bai")).deleteOnExit(); + new File(testBAM.getAbsolutePath() + ".bai").deleteOnExit(); + } + + @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") + public void testAllBasesSeen(TraverseActiveRegions t) { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + + List activeIntervals = getIsActiveIntervals(t, walker, intervals); + // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call + verifyEqualIntervals(intervals, activeIntervals); + } + + private List getIsActiveIntervals(final TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals) { + List activeIntervals = new ArrayList(); + for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, testBAM)) { + t.traverse(walker, dataProvider, 0); + activeIntervals.addAll(walker.isActiveCalls); + } + + return activeIntervals; + } + + @Test (enabled = ENFORCE_CONTRACTS, dataProvider = "TraversalEngineProvider", expectedExceptions = PreconditionError.class) + public void testIsActiveRangeLow (TraverseActiveRegions t) { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(-0.1); + getActiveRegions(t, walker, intervals).values(); + } + + @Test (enabled = ENFORCE_CONTRACTS, dataProvider = "TraversalEngineProvider", expectedExceptions = PreconditionError.class) + public void testIsActiveRangeHigh (TraverseActiveRegions t) { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(1.1); + getActiveRegions(t, walker, intervals).values(); + } + + @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") + public void testActiveRegionCoverage(TraverseActiveRegions t) { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), true); + + Collection activeRegions = getActiveRegions(t, walker, intervals).values(); + verifyActiveRegionCoverage(intervals, activeRegions); + } + + private void verifyActiveRegionCoverage(List intervals, Collection activeRegions) { + List intervalStarts = new ArrayList(); + List intervalStops = new ArrayList(); + + for (GenomeLoc interval : intervals) { + intervalStarts.add(interval.getStartLocation()); + intervalStops.add(interval.getStopLocation()); + } + + Map baseRegionMap = new HashMap(); + + for (ActiveRegion activeRegion : activeRegions) { + for (GenomeLoc activeLoc : toSingleBaseLocs(activeRegion.getLocation())) { + // Contract: Regions do not overlap + Assert.assertFalse(baseRegionMap.containsKey(activeLoc), "Genome location " + activeLoc + " is assigned to more than one region"); + baseRegionMap.put(activeLoc, activeRegion); + } + + GenomeLoc start = activeRegion.getLocation().getStartLocation(); + if (intervalStarts.contains(start)) + intervalStarts.remove(start); + + GenomeLoc stop = activeRegion.getLocation().getStopLocation(); + if (intervalStops.contains(stop)) + intervalStops.remove(stop); + } + + for (GenomeLoc baseLoc : toSingleBaseLocs(intervals)) { + // Contract: Each location in the interval(s) is in exactly one region + // Contract: The total set of regions exactly matches the analysis interval(s) + Assert.assertTrue(baseRegionMap.containsKey(baseLoc), "Genome location " + baseLoc + " is not assigned to any region"); + baseRegionMap.remove(baseLoc); + } + + // Contract: The total set of regions exactly matches the analysis interval(s) + Assert.assertEquals(baseRegionMap.size(), 0, "Active regions contain base(s) outside of the given intervals"); + + // Contract: All explicit interval boundaries must also be region boundaries + Assert.assertEquals(intervalStarts.size(), 0, "Interval start location does not match an active region start location"); + Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location"); + } + + @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") + public void testActiveRegionExtensionOnContig(TraverseActiveRegions t) { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + + Collection activeRegions = getActiveRegions(t, walker, intervals).values(); + for (ActiveRegion activeRegion : activeRegions) { + GenomeLoc loc = activeRegion.getExtendedLoc(); + + // Contract: active region extensions must stay on the contig + Assert.assertTrue(loc.getStart() > 0, "Active region extension begins at location " + loc.getStart() + ", past the left end of the contig"); + int refLen = dictionary.getSequence(loc.getContigIndex()).getSequenceLength(); + Assert.assertTrue(loc.getStop() <= refLen, "Active region extension ends at location " + loc.getStop() + ", past the right end of the contig"); + } + } + + @Test(enabled = true && !DEBUG, dataProvider = "TraversalEngineProvider") + public void testPrimaryReadMapping(TraverseActiveRegions t) { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), + EnumSet.of(ActiveRegionReadState.PRIMARY), + true); + + // Contract: Each read has the Primary state in a single region (or none) + // This is the region of maximum overlap for the read (earlier if tied) + + // simple: Primary in 1:1-999 + // overlap_equal: Primary in 1:1-999 + // overlap_unequal: Primary in 1:1-999 + // boundary_equal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_post: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // extended_and_np: Primary in 1:1-999, Non-Primary in 1:1000-1999, Extended in 1:2000-2999 + // outside_intervals: none + // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_1_post: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_equal: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // simple20: Primary in 20:10000-10100 + + Map activeRegions = getActiveRegions(t, walker, intervals); + ActiveRegion region; + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); + verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + verifyReadMapping(region, "boundary_unequal", "boundary_1_pre", "boundary_equal", "boundary_1_post"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + verifyReadMapping(region); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); + verifyReadMapping(region, "simple20"); + } + + @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") + public void testNonPrimaryReadMapping(TraverseActiveRegions t) { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY), + true); + + // Contract: Each read has the Primary state in a single region (or none) + // This is the region of maximum overlap for the read (earlier if tied) + + // Contract: Each read has the Non-Primary state in all other regions it overlaps + + // simple: Primary in 1:1-999 + // overlap_equal: Primary in 1:1-999 + // overlap_unequal: Primary in 1:1-999 + // boundary_equal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_post: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // extended_and_np: Primary in 1:1-999, Non-Primary in 1:1000-1999, Extended in 1:2000-2999 + // outside_intervals: none + // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_1_post: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_equal: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // simple20: Primary in 20:10000-10100 + + Map activeRegions = getActiveRegions(t, walker, intervals); + ActiveRegion region; + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); + verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + verifyReadMapping(region, "boundary_equal", "boundary_unequal", "boundary_1_pre", "boundary_1_post"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); + verifyReadMapping(region, "simple20"); + } + + @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") + public void testExtendedReadMapping(TraverseActiveRegions t) { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED), + true); + + // Contract: Each read has the Primary state in a single region (or none) + // This is the region of maximum overlap for the read (earlier if tied) + + // Contract: Each read has the Non-Primary state in all other regions it overlaps + // Contract: Each read has the Extended state in regions where it only overlaps if the region is extended + + // simple: Primary in 1:1-999 + // overlap_equal: Primary in 1:1-999 + // overlap_unequal: Primary in 1:1-999 + // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 + // outside_intervals: none + // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // simple20: Primary in 20:10000-10100 + + Map activeRegions = getActiveRegions(t, walker, intervals); + ActiveRegion region; + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); + verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); + verifyReadMapping(region, "simple20"); + } + + @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") + public void testUnmappedReads(TraverseActiveRegions t) { + // TODO + } + + private void verifyReadMapping(ActiveRegion region, String... reads) { + Assert.assertNotNull(region, "Region was unexpectedly null"); + final Set regionReads = new HashSet(); + for (SAMRecord read : region.getReads()) { + Assert.assertFalse(regionReads.contains(read.getReadName()), "Duplicate reads detected in region " + region + " read " + read.getReadName()); + regionReads.add(read.getReadName()); + } + + Collection wantReads = new ArrayList(Arrays.asList(reads)); + for (SAMRecord read : region.getReads()) { + String regionReadName = read.getReadName(); + Assert.assertTrue(wantReads.contains(regionReadName), "Read " + regionReadName + " incorrectly assigned to active region " + region); + wantReads.remove(regionReadName); + } + + Assert.assertTrue(wantReads.isEmpty(), "Reads missing in active region " + region + ", wanted " + (wantReads.isEmpty() ? "" : wantReads.iterator().next())); + } + + private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals) { + return getActiveRegions(t, walker, intervals, testBAM); + } + + private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals, final File bam) { + for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, bam)) + t.traverse(walker, dataProvider, 0); + + return walker.mappedActiveRegions; + } + + private Collection toSingleBaseLocs(GenomeLoc interval) { + List bases = new ArrayList(); + if (interval.size() == 1) + bases.add(interval); + else { + for (int location = interval.getStart(); location <= interval.getStop(); location++) + bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location)); + } + + return bases; + } + + private Collection toSingleBaseLocs(List intervals) { + Set bases = new TreeSet(); // for sorting and uniqueness + for (GenomeLoc interval : intervals) + bases.addAll(toSingleBaseLocs(interval)); + + return bases; + } + + private void verifyEqualIntervals(List aIntervals, List bIntervals) { + Collection aBases = toSingleBaseLocs(aIntervals); + Collection bBases = toSingleBaseLocs(bIntervals); + + Assert.assertTrue(aBases.size() == bBases.size(), "Interval lists have a differing number of bases: " + aBases.size() + " vs. " + bBases.size()); + + Iterator aIter = aBases.iterator(); + Iterator bIter = bBases.iterator(); + while (aIter.hasNext() && bIter.hasNext()) { + GenomeLoc aLoc = aIter.next(); + GenomeLoc bLoc = bIter.next(); + Assert.assertTrue(aLoc.equals(bLoc), "Interval locations do not match: " + aLoc + " vs. " + bLoc); + } + } + + // copied from LocusViewTemplate + protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) { + SAMFileHeader header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test"); + header.setSequenceDictionary(dictionary); + header.setSortOrder(SAMFileHeader.SortOrder.coordinate); + GATKSAMRecord record = new GATKSAMRecord(header); + + record.setReadName(readName); + record.setReferenceIndex(dictionary.getSequenceIndex(contig)); + record.setAlignmentStart(alignmentStart); + + Cigar cigar = new Cigar(); + int len = alignmentEnd - alignmentStart + 1; + cigar.add(new CigarElement(len, CigarOperator.M)); + record.setCigar(cigar); + record.setReadString(new String(new char[len]).replace("\0", "A")); + record.setBaseQualities(new byte[len]); + record.setReadGroup(new GATKSAMReadGroupRecord(header.getReadGroup("test"))); + + return record; + } + + private List createDataProviders(TraverseActiveRegions traverseActiveRegions, final Walker walker, List intervals, File bamFile) { + GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + engine.setGenomeLocParser(genomeLocParser); + + Collection samFiles = new ArrayList(); + SAMReaderID readerID = new SAMReaderID(bamFile, new Tags()); + samFiles.add(readerID); + + SAMDataSource dataSource = new SAMDataSource(referenceFile, samFiles, new ThreadAllocation(), null, genomeLocParser, + false, + ValidationStringency.STRICT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + new ArrayList(), + false, (byte)30, false, true, null, IntervalMergingRule.ALL); + + engine.setReadsDataSource(dataSource); + final Set samples = ReadUtils.getSAMFileSamples(dataSource.getHeader()); + + traverseActiveRegions.initialize(engine, walker); + List providers = new ArrayList(); + for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new ActiveRegionShardBalancer())) { + for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples)) { + providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); + } + } + + return providers; + } + + // --------------------------------------------------------------------------------------------------------- + // + // Combinatorial tests to ensure reads are going into the right regions + // + // --------------------------------------------------------------------------------------------------------- + + @DataProvider(name = "CombinatorialARTTilingProvider") + public Object[][] makeCombinatorialARTTilingProvider() { + final List tests = new LinkedList(); + + final List starts = Arrays.asList( + 1, // very start of the chromosome + ArtificialBAMBuilder.BAM_SHARD_SIZE - 100, // right before the shard boundary + ArtificialBAMBuilder.BAM_SHARD_SIZE + 100 // right after the shard boundary + ); + + final List> allReadStates = Arrays.asList( + EnumSet.of(ActiveRegionReadState.PRIMARY), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED) + ); + + final int maxTests = Integer.MAX_VALUE; + int nTests = 0; + for ( final int readLength : Arrays.asList(100) ) { + for ( final int skips : Arrays.asList(0, 10) ) { + for ( final int start : starts ) { + for ( final int nReadsPerLocus : Arrays.asList(1, 2) ) { + for ( final int nLoci : Arrays.asList(1, 1000) ) { + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci); + bamBuilder.setReadLength(readLength); + bamBuilder.setSkipNLoci(skips); + bamBuilder.setAlignmentStart(start); + for ( EnumSet readStates : allReadStates ) { + for ( final GenomeLocSortedSet activeRegions : enumerateActiveRegions(bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd())) { + nTests++; + if ( nTests < maxTests ) // && nTests == 1238 ) + tests.add(new Object[]{new TraverseActiveRegions<>(), nTests, activeRegions, readStates, bamBuilder}); + } + } + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private Collection enumerateActiveRegions(final int start, final int stop) { + // should basically cut up entire region into equal sized chunks, of + // size 10, 20, 50, 100, etc, alternating skipping pieces so they are inactive + // Need to make sure we include some edge cases: + final List activeRegions = new LinkedList(); + + for ( final int stepSize : Arrays.asList(11, 29, 53, 97) ) { + for ( final boolean startWithActive : Arrays.asList(true, false) ) { + activeRegions.add(makeActiveRegionMask(start, stop, stepSize, startWithActive)); + } + } + + // active region is the whole interval + activeRegions.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start, stop))); + + // active region extends up to the end of the data, but doesn't include start + activeRegions.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start+10, stop))); + + return activeRegions; + } + + private GenomeLocSortedSet makeActiveRegionMask(final int start, final int stop, final int stepSize, final boolean startWithActive) { + final GenomeLocSortedSet active = new GenomeLocSortedSet(genomeLocParser); + + boolean includeRegion = startWithActive; + for ( int left = start; left < stop; left += stepSize) { + final int right = left + stepSize; + final GenomeLoc region = genomeLocParser.createGenomeLoc("1", left, right); + if ( includeRegion ) + active.add(region); + includeRegion = ! includeRegion; + } + + return active; + } + + + @Test(enabled = true && ! DEBUG, dataProvider = "CombinatorialARTTilingProvider") + public void testARTReadsInActiveRegions(final TraverseActiveRegions traversal, final int id, final GenomeLocSortedSet activeRegions, final EnumSet readStates, final ArtificialBAMBuilder bamBuilder) { + logger.warn("Running testARTReadsInActiveRegions id=" + id + " locs " + activeRegions + " against bam " + bamBuilder); + final List intervals = Arrays.asList( + genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd()) + ); + + final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions, false); + walker.setStates(readStates); + + final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile()); + + final Set alreadySeenReads = new HashSet(); // for use with the primary / non-primary + for ( final ActiveRegion region : activeRegionsMap.values() ) { + final Set readNamesInRegion = readNamesInRegion(region); + int nReadsExpectedInRegion = 0; + for ( final GATKSAMRecord read : bamBuilder.makeReads() ) { + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc(read); + + boolean shouldBeInRegion = readStates.contains(ActiveRegionReadState.EXTENDED) + ? region.getExtendedLoc().overlapsP(readLoc) + : region.getLocation().overlapsP(readLoc); + + if ( ! readStates.contains(ActiveRegionReadState.NONPRIMARY) ) { + if ( alreadySeenReads.contains(read.getReadName()) ) + shouldBeInRegion = false; + else if ( shouldBeInRegion ) + alreadySeenReads.add(read.getReadName()); + } + + String msg = readNamesInRegion.contains(read.getReadName()) == shouldBeInRegion ? "" : "Region " + region + + " failed contains read check: read " + read + " with span " + readLoc + " should be in region is " + shouldBeInRegion + " but I got the opposite"; + Assert.assertEquals(readNamesInRegion.contains(read.getReadName()), shouldBeInRegion, msg); + + nReadsExpectedInRegion += shouldBeInRegion ? 1 : 0; + } + + Assert.assertEquals(region.size(), nReadsExpectedInRegion, "There are more reads in active region " + region + "than expected"); + } + } + + private Set readNamesInRegion(final ActiveRegion region) { + final Set readNames = new LinkedHashSet(region.getReads().size()); + for ( final SAMRecord read : region.getReads() ) + readNames.add(read.getReadName()); + return readNames; + } + + // --------------------------------------------------------------------------------------------------------- + // + // Make sure all insertion reads are properly included in the active regions + // + // --------------------------------------------------------------------------------------------------------- + + @Test(dataProvider = "TraversalEngineProvider", enabled = true && ! DEBUG) + public void ensureAllInsertionReadsAreInActiveRegions(final TraverseActiveRegions traversal) { + + final int readLength = 10; + final int start = 20; + final int nReadsPerLocus = 10; + final int nLoci = 3; + + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci); + bamBuilder.setReadLength(readLength); + bamBuilder.setAlignmentStart(start); + + // note that the position must be +1 as the read's all I cigar puts the end 1 bp before start, leaving it out of the region + GATKSAMRecord allI = ArtificialSAMUtils.createArtificialRead(bamBuilder.getHeader(),"allI",0,start+1,readLength); + allI.setCigarString(readLength + "I"); + allI.setReadGroup(new GATKSAMReadGroupRecord(bamBuilder.getHeader().getReadGroups().get(0))); + + bamBuilder.addReads(allI); + + final GenomeLocSortedSet activeRegions = new GenomeLocSortedSet(bamBuilder.getGenomeLocParser()); + activeRegions.add(bamBuilder.getGenomeLocParser().createGenomeLoc("1", 10, 30)); + final List intervals = Arrays.asList( + genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd()) + ); + + final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions, false); + + final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile()); + + final ActiveRegion region = activeRegionsMap.values().iterator().next(); + int nReadsExpectedInRegion = 0; + + final Set readNamesInRegion = readNamesInRegion(region); + for ( final GATKSAMRecord read : bamBuilder.makeReads() ) { + Assert.assertTrue(readNamesInRegion.contains(read.getReadName()), + "Region " + region + " should contain read " + read + " with cigar " + read.getCigarString() + " but it wasn't"); + nReadsExpectedInRegion++; + } + + Assert.assertEquals(region.size(), nReadsExpectedInRegion, "There are more reads in active region " + region + "than expected"); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicatesUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicatesUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicatesUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicatesUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsUnitTest.java new file mode 100644 index 000000000..a03802635 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsUnitTest.java @@ -0,0 +1,167 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.traversals; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import org.broadinstitute.gatk.engine.walkers.TestCountReadsWalker; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.gatk.engine.datasources.reads.*; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.engine.walkers.ReadWalker; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static org.testng.Assert.fail; + +/** + * + * User: aaron + * Date: Apr 24, 2009 + * Time: 3:42:16 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date Apr 24, 2009 + *

    + * Class TraverseReadsUnitTest + *

    + * test traversing reads + */ +public class TraverseReadsUnitTest extends BaseTest { + + private ReferenceSequenceFile seq; + private SAMReaderID bam = new SAMReaderID(new File(validationDataLocation + "index_test.bam"),new Tags()); // TCGA-06-0188.aligned.duplicates_marked.bam"); + private File refFile = new File(validationDataLocation + "Homo_sapiens_assembly17.fasta"); + private List bamList; + private ReadWalker countReadWalker; + private File output; + private TraverseReadsNano traversalEngine = null; + + private IndexedFastaSequenceFile ref = null; + private GenomeLocParser genomeLocParser = null; + private GenomeAnalysisEngine engine = null; + + @BeforeClass + public void doOnce() { + try { + ref = new CachingIndexedFastaSequenceFile(refFile); + } + catch(FileNotFoundException ex) { + throw new UserException.CouldNotReadInputFile(refFile,ex); + } + genomeLocParser = new GenomeLocParser(ref); + + engine = new GenomeAnalysisEngine(); + engine.setReferenceDataSource(refFile); + engine.setGenomeLocParser(genomeLocParser); + } + + /** + * This function does the setup of our parser, before each method call. + *

    + * Called before every test case method. + */ + @BeforeMethod + public void doForEachTest() { + output = new File("testOut.txt"); + FileOutputStream out = null; + PrintStream ps; // declare a print stream object + + try { + out = new FileOutputStream(output); + } catch (FileNotFoundException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + fail("Couldn't open the output file"); + } + + bamList = new ArrayList(); + bamList.add(bam); + countReadWalker = new TestCountReadsWalker(); + + traversalEngine = new TraverseReadsNano(1); + traversalEngine.initialize(engine, countReadWalker); + } + + /** Test out that we can shard the file and iterate over every read */ + @Test + public void testUnmappedReadCount() { + SAMDataSource dataSource = new SAMDataSource(refFile, bamList,new ThreadAllocation(),null,genomeLocParser); + Iterable shardStrategy = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + + countReadWalker.initialize(); + Object accumulator = countReadWalker.reduceInit(); + + for(Shard shard: shardStrategy) { + if (shard == null) { + fail("Shard == null"); + } + + ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null, Collections.emptyList()); + accumulator = traversalEngine.traverse(countReadWalker, dataProvider, accumulator); + dataProvider.close(); + } + + countReadWalker.onTraversalDone(accumulator); + + if (!(accumulator instanceof Long)) { + fail("Count read walker should return a Long."); + } + if (!accumulator.equals(new Long(10000))) { + fail("there should be 10000 mapped reads in the index file, there was " + (accumulator)); + } + } + +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestCountLociWalker.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestCountLociWalker.java new file mode 100644 index 000000000..8b7a8d758 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestCountLociWalker.java @@ -0,0 +1,58 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import org.broadinstitute.gatk.utils.commandline.Output; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; + +import java.io.PrintStream; + +public class TestCountLociWalker extends LocusWalker { + @Output + private PrintStream out; + + @Override + public Integer map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + return 1; + } + + @Override + public Long reduceInit() { + return 0L; + } + + @Override + public Long reduce(final Integer value, final Long sum) { + return value + sum; + } + + @Override + public void onTraversalDone(final Long result) { + out.println(result); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestCountReadsWalker.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestCountReadsWalker.java new file mode 100644 index 000000000..cc0162fc1 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestCountReadsWalker.java @@ -0,0 +1,59 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import org.broadinstitute.gatk.utils.commandline.Output; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.io.PrintStream; + +public class TestCountReadsWalker extends ReadWalker { + @Output + PrintStream out; + + @Override + public Integer map(final ReferenceContext ref, final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker) { + return 1; + } + + @Override + public Long reduceInit() { + return 0L; + } + + @Override + public Long reduce(final Integer value, final Long sum) { + return value + sum; + } + + @Override + public void onTraversalDone(final Long result) { + if (out != null) + out.println(result); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestErrorThrowingWalker.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestErrorThrowingWalker.java new file mode 100644 index 000000000..00774f7b7 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestErrorThrowingWalker.java @@ -0,0 +1,70 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.Input; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; + +public class TestErrorThrowingWalker extends RefWalker implements TreeReducible, NanoSchedulable { + @Input(fullName = "exception", shortName = "E", doc = "Java class of exception to throw", required = true) + public String exceptionToThrow; + + @Argument(fullName = "failMethod", shortName = "fail", doc = "Determines which method to fail in", required = false) + public FailMethod failMethod = FailMethod.MAP; + + @Override + public Integer map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + if (ref == null) // only throw exception when we are in proper map, not special map(null) call + return null; + + if (failMethod == FailMethod.MAP) + FailMethod.fail(exceptionToThrow); + + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(final Integer value, final Integer sum) { + if (value != null && failMethod == FailMethod.REDUCE) + FailMethod.fail(exceptionToThrow); + return sum; + } + + @Override + public Integer treeReduce(final Integer lhs, final Integer rhs) { + if (failMethod == FailMethod.TREE_REDUCE) + FailMethod.fail(exceptionToThrow); + return rhs; + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestPrintReadsWalker.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestPrintReadsWalker.java new file mode 100644 index 000000000..bbf653ac1 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestPrintReadsWalker.java @@ -0,0 +1,76 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMFileWriter; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.io.NWaySAMFileWriter; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.Output; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +public class TestPrintReadsWalker extends ReadWalker implements NanoSchedulable { + @Output + private GATKSAMFileWriter out; + + @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false) + public boolean NO_PG_TAG = false; + + @Override + public void initialize() { + // All for the no_pg_tag. Should this be in the engine and not in the walker? + final GenomeAnalysisEngine toolkit = getToolkit(); + final SAMFileHeader outputHeader = toolkit.getSAMFileHeader().clone(); + final String PROGRAM_RECORD_NAME = "GATK PrintReads"; + final boolean preSorted = true; + if (toolkit.getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) { + NWaySAMFileWriter.setupWriter(out, toolkit, outputHeader, preSorted, this, PROGRAM_RECORD_NAME); + } else { + out.writeHeader(outputHeader); + out.setPresorted(preSorted); + } + } + + @Override + public GATKSAMRecord map(final ReferenceContext ref, final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker) { + return read; + } + + @Override + public SAMFileWriter reduceInit() { + return out; + } + + @Override + public SAMFileWriter reduce(final GATKSAMRecord read, final SAMFileWriter output) { + output.addAlignment(read); + return output; + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestPrintVariantsWalker.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestPrintVariantsWalker.java new file mode 100644 index 000000000..8af514693 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestPrintVariantsWalker.java @@ -0,0 +1,99 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.*; +import org.broadinstitute.gatk.engine.GATKVCFUtils; +import org.broadinstitute.gatk.engine.SampleUtils; +import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; +import org.broadinstitute.gatk.utils.commandline.Output; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.variant.ChromosomeCountConstants; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; + +import java.util.*; + +public class TestPrintVariantsWalker extends RodWalker implements TreeReducible { + @ArgumentCollection + private StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + @Argument(fullName = "fullyDecode", doc = "If true, the incoming VariantContext will be fully decoded", required = false) + private boolean fullyDecode = false; + + @Output + private VariantContextWriter vcfWriter = null; + + private Map vcfRods = null; + + @Override + public void initialize() { + vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); + final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); + headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions)); + headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); + final VCFHeader vcfHeader = new VCFHeader(headerLines, samples); + vcfWriter.writeHeader(vcfHeader); + } + + @Override + public Integer map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + if (tracker == null) + return 0; + final Collection vcs = tracker.getValues(variantCollection.variants, context.getLocation()); + for (VariantContext vc : vcs) { + if (fullyDecode) + vc = vc.fullyDecode(vcfRods.get(vc.getSource()), getToolkit().lenientVCFProcessing()); + vcfWriter.add(vc); + } + return vcs.isEmpty() ? 0 : 1; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(final Integer counter, final Integer sum) { + return counter + sum; + } + + @Override + public Integer treeReduce(final Integer lhs, final Integer rhs) { + return reduce(lhs, rhs); + } + + @Override + public void onTraversalDone(final Integer sum) { + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/WalkerTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/WalkerTest.java new file mode 100644 index 000000000..ff6b1242f --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/WalkerTest.java @@ -0,0 +1,457 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import htsjdk.tribble.Tribble; +import htsjdk.tribble.index.Index; +import htsjdk.tribble.index.IndexFactory; +import htsjdk.variant.bcf2.BCF2Utils; +import htsjdk.variant.vcf.VCFCodec; +import org.apache.commons.lang.StringUtils; +import org.broadinstitute.gatk.engine.CommandLineExecutable; +import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.engine.crypt.CryptUtils; +import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.MD5DB; +import org.broadinstitute.gatk.utils.MD5Mismatch; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.classloader.JVMUtils; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.exceptions.GATKException; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.testng.Assert; +import org.testng.annotations.AfterSuite; +import org.testng.annotations.BeforeMethod; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.PrintStream; +import java.text.SimpleDateFormat; +import java.util.*; + +public class WalkerTest extends BaseTest { + public static final String gatkKeyFile = CryptUtils.GATK_USER_KEY_DIRECTORY + "gsamembers_broadinstitute.org.key"; + + private static final boolean GENERATE_SHADOW_BCF = true; + private static final boolean ENABLE_PHONE_HOME_FOR_TESTS = false; + private static final boolean ENABLE_ON_THE_FLY_CHECK_FOR_VCF_INDEX = false; + private static final boolean ENABLE_AUTO_INDEX_CREATION_AND_LOCKING_FOR_TESTS = false; + + private static MD5DB md5DB = new MD5DB(); + + @BeforeMethod + public void initializeWalkerTests() { + logger.debug("Initializing walker tests"); + Utils.resetRandomGenerator(); + } + + @AfterSuite + public void finalizeWalkerTests() { + logger.debug("Finalizing walker tests"); + md5DB.close(); + } + + public static MD5DB getMd5DB() { + return md5DB; + } + + public void validateOutputBCFIfPossible(final String name, final File resultFile) { + final File bcfFile = BCF2Utils.shadowBCF(resultFile); + if ( bcfFile != null && bcfFile.exists() ) { + logger.warn("Checking shadow BCF output file " + bcfFile + " against VCF file " + resultFile); + try { + assertVCFandBCFFilesAreTheSame(resultFile, bcfFile); + logger.warn(" Shadow BCF PASSED!"); + } catch ( Exception e ) { + Assert.fail("Exception received reading shadow BCFFile " + bcfFile + " for test " + name, e); + } + } + } + + public void validateOutputIndex(final String name, final File resultFile) { + if ( !ENABLE_ON_THE_FLY_CHECK_FOR_VCF_INDEX ) + return; + + File indexFile = Tribble.indexFile(resultFile); + //System.out.println("Putative index file is " + indexFile); + if ( indexFile.exists() ) { + if ( resultFile.getAbsolutePath().contains(".vcf") ) { + // todo -- currently we only understand VCF files! Blow up since we can't test them + throw new GATKException("Found an index created for file " + resultFile + " but we can only validate VCF files. Extend this code!"); + } + + System.out.println("Verifying on-the-fly index " + indexFile + " for test " + name + " using file " + resultFile); + Index indexFromOutputFile = IndexFactory.createDynamicIndex(resultFile, new VCFCodec()); + Index dynamicIndex = IndexFactory.loadIndex(indexFile.getAbsolutePath()); + + if ( ! indexFromOutputFile.equalsIgnoreProperties(dynamicIndex) ) { + Assert.fail(String.format("Index on disk from indexing on the fly not equal to the index created after the run completed. FileIndex %s vs. on-the-fly %s%n", + indexFromOutputFile.getProperties(), + dynamicIndex.getProperties())); + } + } + } + + public List assertMatchingMD5s(final String testName, final String testClassName, List resultFiles, List expectedMD5s) { + List md5s = new ArrayList(); + List fails = new ArrayList(); + + for (int i = 0; i < resultFiles.size(); i++) { + MD5DB.MD5Match result = getMd5DB().testFileMD5(testName, testClassName, resultFiles.get(i), expectedMD5s.get(i), parameterize()); + validateOutputBCFIfPossible(testName, resultFiles.get(i)); + if ( ! result.failed ) { + validateOutputIndex(testName, resultFiles.get(i)); + md5s.add(result.expectedMD5); + } else { + fails.add(result); + } + } + + if ( ! fails.isEmpty() ) { + List actuals = new ArrayList(); + List expecteds = new ArrayList(); + List diffEngineOutputs = new ArrayList(); + + for ( final MD5DB.MD5Match fail : fails ) { + actuals.add(fail.actualMD5); + expecteds.add(fail.expectedMD5); + diffEngineOutputs.add(fail.diffEngineOutput); + logger.warn("Fail: " + fail.failMessage); + } + + final MD5Mismatch failure = new MD5Mismatch(actuals, expecteds, diffEngineOutputs); + Assert.fail(failure.toString()); + } + + return md5s; + } + + public String buildCommandLine(String... arguments) { + String cmdline = ""; + + for ( int argIndex = 0; argIndex < arguments.length; argIndex++ ) { + cmdline += arguments[argIndex]; + + if (argIndex < arguments.length - 1) { + cmdline += " "; + } + } + + return cmdline; + } + + public class WalkerTestSpec { + // Arguments implicitly included in all Walker command lines, unless explicitly + // disabled using the disableImplicitArgs() method below. + String args = ""; + int nOutputFiles = -1; + List md5s = null; + List exts = null; + Class expectedException = null; + boolean includeImplicitArgs = true; + boolean includeShadowBCF = true; + + // Name of the test class that created this test case + private Class testClass; + + // the default output path for the integration test + private File outputFileLocation = null; + + protected Map auxillaryFiles = new HashMap(); + + public WalkerTestSpec(String args, List md5s) { + this(args, -1, md5s); + } + + public WalkerTestSpec(String args, int nOutputFiles, List md5s) { + this.args = args; + this.nOutputFiles = md5s.size(); + this.md5s = md5s; + this.testClass = getCallingTestClass(); + } + + public WalkerTestSpec(String args, List exts, List md5s) { + this(args, -1, exts, md5s); + } + + public WalkerTestSpec(String args, int nOutputFiles, List exts, List md5s) { + this.args = args; + this.nOutputFiles = md5s.size(); + this.md5s = md5s; + this.exts = exts; + this.testClass = getCallingTestClass(); + } + + // @Test(expectedExceptions) doesn't work in integration tests, so use this instead + public WalkerTestSpec(String args, int nOutputFiles, Class expectedException) { + this.args = args; + this.nOutputFiles = nOutputFiles; + this.expectedException = expectedException; + this.testClass = getCallingTestClass(); + } + + private Class getCallingTestClass() { + return JVMUtils.getCallingClass(getClass()); + } + + public String getTestClassName() { + return testClass.getSimpleName(); + } + + public String getArgsWithImplicitArgs() { + String args = this.args; + if ( includeImplicitArgs ) { + args = args + (ENABLE_PHONE_HOME_FOR_TESTS ? + String.format(" -et %s ", GATKRunReport.PhoneHomeOption.AWS) : + String.format(" -et %s -K %s ", GATKRunReport.PhoneHomeOption.NO_ET, gatkKeyFile)); + if ( includeShadowBCF && GENERATE_SHADOW_BCF ) + args = args + " --generateShadowBCF "; + if ( ! ENABLE_AUTO_INDEX_CREATION_AND_LOCKING_FOR_TESTS ) + args = args + " --disable_auto_index_creation_and_locking_when_reading_rods "; + } + + return args; + } + + /** + * In the case where the input VCF files are malformed and cannot be fixed + * this function tells the engine to not try to generate a shadow BCF + * which will ultimately blow up... + */ + public void disableShadowBCF() { this.includeShadowBCF = false; } + public void setOutputFileLocation(File outputFileLocation) { + this.outputFileLocation = outputFileLocation; + } + + protected File getOutputFileLocation() { + return outputFileLocation; + } + + public boolean expectsException() { + return expectedException != null; + } + + public Class getExpectedException() { + if ( ! expectsException() ) throw new ReviewedGATKException("Tried to get expection for walker test that doesn't expect one"); + return expectedException; + } + + public void addAuxFile(String expectededMD5sum, File outputfile) { + auxillaryFiles.put(expectededMD5sum, outputfile); + } + + public void disableImplicitArgs() { + includeImplicitArgs = false; + } + } + + protected boolean parameterize() { + return false; + } + + public enum ParallelTestType { + TREE_REDUCIBLE, + NANO_SCHEDULED, + BOTH + } + + protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec, ParallelTestType testType) { + final List ntThreads = testType == ParallelTestType.TREE_REDUCIBLE || testType == ParallelTestType.BOTH ? Arrays.asList(1, 4) : Collections.emptyList(); + final List cntThreads = testType == ParallelTestType.NANO_SCHEDULED || testType == ParallelTestType.BOTH ? Arrays.asList(1, 4) : Collections.emptyList(); + + return executeTest(name, spec, ntThreads, cntThreads); + } + + protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec) { + return executeTestParallel(name, spec, ParallelTestType.TREE_REDUCIBLE); + } + + protected Pair, List> executeTest(final String name, WalkerTestSpec spec, List ntThreads, List cpuThreads) { + String originalArgs = spec.args; + Pair, List> results = null; + + boolean ran1 = false; + for ( int nt : ntThreads ) { + String extra = nt == 1 ? "" : (" -nt " + nt); + ran1 = ran1 || nt == 1; + spec.args = originalArgs + extra; + results = executeTest(name + "-nt-" + nt, spec); + } + + for ( int nct : cpuThreads ) { + if ( nct != 1 ) { + String extra = " -nct " + nct; + spec.args = originalArgs + extra; + results = executeTest(name + "-cnt-" + nct, spec); + } + } + + return results; + } + + protected Pair, List> executeTest(final String name, WalkerTestSpec spec) { + List tmpFiles = new ArrayList(); + for (int i = 0; i < spec.nOutputFiles; i++) { + String ext = spec.exts == null ? ".tmp" : "." + spec.exts.get(i); + File fl = createTempFile(String.format("walktest.tmp_param.%d", i), ext); + + // Cleanup any potential shadow BCFs on exit too, if we're generating them + if ( spec.includeShadowBCF && GENERATE_SHADOW_BCF ) { + final File potentalShadowBCFFile = BCF2Utils.shadowBCF(fl); + potentalShadowBCFFile.deleteOnExit(); + new File(potentalShadowBCFFile.getAbsolutePath() + Tribble.STANDARD_INDEX_EXTENSION).deleteOnExit(); + } + + tmpFiles.add(fl); + } + + final String args = String.format(spec.getArgsWithImplicitArgs(), tmpFiles.toArray()); + System.out.println(Utils.dupString('-', 80)); + + if ( spec.expectsException() ) { + // this branch handles the case were we are testing that a walker will fail as expected + return executeTest(name, spec.getTestClassName(), spec.getOutputFileLocation(), null, tmpFiles, args, spec.getExpectedException()); + } else { + List md5s = new LinkedList(); + md5s.addAll(spec.md5s); + + // check to see if they included any auxillary files, if so add them to the list and set them to be deleted on exit + for (String md5 : spec.auxillaryFiles.keySet()) { + md5s.add(md5); + final File auxFile = spec.auxillaryFiles.get(md5); + auxFile.deleteOnExit(); + tmpFiles.add(auxFile); + } + return executeTest(name, spec.getTestClassName(), spec.getOutputFileLocation(), md5s, tmpFiles, args, null); + } + } + + private void qcMD5s(String name, List md5s) { + final String exampleMD5 = "709a1f482cce68992c637da3cff824a8"; + for (String md5 : md5s) { + if ( md5 == null ) + throw new IllegalArgumentException("Null MD5 found in test " + name); + if ( md5.equals("") ) // ok + continue; + if ( ! StringUtils.isAlphanumeric(md5) ) + throw new IllegalArgumentException("MD5 contains non-alphanumeric characters test " + name + " md5=" + md5); + if ( md5.length() != exampleMD5.length() ) + throw new IllegalArgumentException("Non-empty MD5 of unexpected number of characters test " + name + " md5=" + md5); + } + } + + + /** + * execute the test, given the following: + * @param testName the name of the test + * @param testClassName the name of the class that contains the test + * @param md5s the list of md5s + * @param tmpFiles the temp file corresponding to the md5 list + * @param args the argument list + * @param expectedException the expected exception or null + * @return a pair of file and string lists + */ + private Pair, List> executeTest(String testName, String testClassName, File outputFileLocation, List md5s, List tmpFiles, String args, Class expectedException) { + if ( md5s != null ) qcMD5s(testName, md5s); + + if (outputFileLocation != null) + args += " -o " + outputFileLocation.getAbsolutePath(); + executeTest(testName, testClassName, args, expectedException); + + if ( expectedException != null ) { + return null; + } else { + // we need to check MD5s + return new Pair, List>(tmpFiles, assertMatchingMD5s(testName, testClassName, tmpFiles, md5s)); + } + } + + /** + * execute the test, given the following: + * @param testName the name of the test + * @param testClassName the name of the class that contains the test + * @param args the argument list + * @param expectedException the expected exception or null + */ + private void executeTest(String testName, String testClassName, String args, Class expectedException) { + CommandLineGATK instance = new CommandLineGATK(); + String[] command = Utils.escapeExpressions(args); + // run the executable + boolean gotAnException = false; + try { + final String now = new SimpleDateFormat("HH:mm:ss").format(new Date()); + final String cmdline = Utils.join(" ",command); + System.out.println(String.format("[%s] Executing test %s:%s with GATK arguments: %s", now, testClassName, testName, cmdline)); + // also write the command line to the HTML log for convenient follow-up + // do the replaceAll so paths become relative to the current + BaseTest.log(cmdline.replaceAll(publicTestDirRoot, "").replaceAll(privateTestDirRoot, "")); + CommandLineExecutable.start(instance, command); + } catch (Exception e) { + gotAnException = true; + if ( expectedException != null ) { + // we expect an exception + //System.out.println(String.format("Wanted exception %s, saw %s", expectedException, e.getClass())); + if ( expectedException.isInstance(e) ) { + // it's the type we expected + //System.out.println(String.format(" => %s PASSED", name)); + } else { + final String message = String.format("Test %s:%s expected exception %s but instead got %s with error message %s", + testClassName, testName, expectedException, e.getClass(), e.getMessage()); + if ( e.getCause() != null ) { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final PrintStream ps = new PrintStream(baos); + e.getCause().printStackTrace(ps); + BaseTest.log(message); + BaseTest.log(baos.toString()); + } + Assert.fail(message); + } + } else { + // we didn't expect an exception but we got one :-( + throw new RuntimeException(e); + } + } + + // catch failures from the integration test + if ( expectedException != null ) { + if ( ! gotAnException ) + // we expected an exception but didn't see it + Assert.fail(String.format("Test %s:%s expected exception %s but none was thrown", testClassName, testName, expectedException.toString())); + } else { + if ( CommandLineExecutable.result != 0) { + throw new RuntimeException("Error running the GATK with arguments: " + args); + } + } + } + + + protected File createTempFileFromBase(final String name) { + File fl = new File(name); + fl.deleteOnExit(); + return fl; + } +} diff --git a/public/gatk-queue-extensions-generator/pom.xml b/public/gatk-queue-extensions-generator/pom.xml index 296ca529f..9799191a8 100644 --- a/public/gatk-queue-extensions-generator/pom.xml +++ b/public/gatk-queue-extensions-generator/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.3 + 3.4-SNAPSHOT ../.. @@ -21,7 +21,7 @@ ${project.groupId} - gatk-tools-public + gatk-engine ${project.version} diff --git a/public/gatk-queue-extensions-generator/src/main/java/org/broadinstitute/gatk/queue/extensions/gatk/ArgumentDefinitionField.java b/public/gatk-queue-extensions-generator/src/main/java/org/broadinstitute/gatk/queue/extensions/gatk/ArgumentDefinitionField.java index 1e9e5cc45..1a6cda658 100644 --- a/public/gatk-queue-extensions-generator/src/main/java/org/broadinstitute/gatk/queue/extensions/gatk/ArgumentDefinitionField.java +++ b/public/gatk-queue-extensions-generator/src/main/java/org/broadinstitute/gatk/queue/extensions/gatk/ArgumentDefinitionField.java @@ -507,7 +507,7 @@ public abstract class ArgumentDefinitionField extends ArgumentField { @Override protected String getFreezeFields() { return String.format( ("if (%2$s != null && !org.broadinstitute.gatk.utils.io.IOUtils.isSpecialFile(%2$s))%n" + - " if (!org.broadinstitute.gatk.engine.io.stubs.VCFWriterArgumentTypeDescriptor.isCompressed(%2$s.getPath))%n" + + " if (!org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor.isCompressed(%2$s.getPath))%n" + " %1$s = new File(%2$s.getPath + \"%3$s\")%n"), auxFieldName, originalFieldName, Tribble.STANDARD_INDEX_EXTENSION); } diff --git a/public/gatk-queue-extensions-generator/src/main/java/org/broadinstitute/gatk/queue/extensions/gatk/GATKExtensionsGenerator.java b/public/gatk-queue-extensions-generator/src/main/java/org/broadinstitute/gatk/queue/extensions/gatk/GATKExtensionsGenerator.java index d125e3dc0..3b7f7db17 100644 --- a/public/gatk-queue-extensions-generator/src/main/java/org/broadinstitute/gatk/queue/extensions/gatk/GATKExtensionsGenerator.java +++ b/public/gatk-queue-extensions-generator/src/main/java/org/broadinstitute/gatk/queue/extensions/gatk/GATKExtensionsGenerator.java @@ -161,6 +161,8 @@ public class GATKExtensionsGenerator extends CommandLineProgram { if (scatterClass != null) { isScatter = true; constructor += String.format("scatterClass = classOf[%s]%n", scatterClass); + final boolean includeUnmapped = getUnmappedInclusion(walkerType); + constructor += String.format("setupScatterFunction = { case scatter: GATKScatterFunction => scatter.includeUnmapped = %b }%n", includeUnmapped); } writeClass(GATK_EXTENSIONS_PACKAGE_NAME + "." + clpClassName, walkerName, @@ -226,6 +228,15 @@ public class GATKExtensionsGenerator extends CommandLineProgram { return StringUtils.capitalize(partitionType.name().toLowerCase()) + "ScatterFunction"; } + /** + * Should the scatter function for this walker include unmapped reads? + * @param walkerType The walker + * @return True if unmapped reads should be processed by this walker + */ + private boolean getUnmappedInclusion(Class walkerType) { + return walkerType.getAnnotation(PartitionBy.class).includeUnmapped(); + } + /** * Writes a dynamically generated scala wrapper for a class. * @param baseClass The class to extend from. diff --git a/public/gatk-queue-extensions-public/pom.xml b/public/gatk-queue-extensions-public/pom.xml index 4d46ce170..287479a94 100644 --- a/public/gatk-queue-extensions-public/pom.xml +++ b/public/gatk-queue-extensions-public/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.3 + 3.4-SNAPSHOT ../.. @@ -39,6 +39,10 @@ log4j log4j + + com.github.broadinstitute + picard + ${project.groupId} - gatk-tools-public + gatk-utils ${project.version} test-jar test diff --git a/public/gatk-queue-extensions-public/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/CNV/ONLY_GENOTYPE_xhmmCNVpipeline.scala b/public/gatk-queue-extensions-public/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/CNV/ONLY_GENOTYPE_xhmmCNVpipeline.scala new file mode 100644 index 000000000..83379787f --- /dev/null +++ b/public/gatk-queue-extensions-public/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/CNV/ONLY_GENOTYPE_xhmmCNVpipeline.scala @@ -0,0 +1,103 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.queue.qscripts.CNV + +import org.broadinstitute.gatk.queue.extensions.gatk._ +import org.broadinstitute.gatk.queue.QScript +import org.broadinstitute.gatk.queue.util.VCF_BAM_utilities +import org.broadinstitute.gatk.queue.extensions.gatk.DoC._ +import org.broadinstitute.gatk.utils.commandline._ +import java.io.{File, PrintStream, PrintWriter} +import org.broadinstitute.gatk.utils.text.XReadLines +import collection.JavaConversions._ +import org.broadinstitute.gatk.tools.walkers.coverage.CoverageUtils +import org.broadinstitute.gatk.queue.function.scattergather.{CloneFunction, ScatterFunction, GatherFunction, ScatterGatherableFunction} +import org.broadinstitute.gatk.queue.function.{CommandLineFunction, InProcessFunction} +import org.broadinstitute.gatk.utils.io.IOUtils + +class ONLY_GENOTYPE_xhmmCNVpipeline extends QScript { + qscript => + + @Input(doc = "bam input, as as a list of .bam files, or a list of bam files with sample IDs to be used ( as specified at https://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_CommandLineGATK.html#--sample_rename_mapping_file )", shortName = "I", required = true) + var bams: File = _ + + @Input(doc = "xhmm executable file", shortName = "xhmmExec", required = true) + var xhmmExec: File = _ + + @Input(shortName = "R", doc = "ref", required = true) + var referenceFile: File = _ + + @Argument(doc = "Samples to run together for DoC, CNV discovery, and CNV genotyping. By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false) + var samplesPerJob = 1 + + @Output(doc = "Base name for files to output", shortName = "o", required = true) + var outputBase: File = _ + + @Argument(shortName = "xhmmParams", doc = "xhmm model parameters file", required = true) + var xhmmParamsArg: File = _ + + @Argument(shortName = "genotypeParams", doc = "xhmm command-line parameters for genotyping step", required = false) + var genotypeCommandLineParams: String = "" + + @Argument(shortName = "addGenotypeRegions", doc = "Additional interval list files to be genotyped", required = false) + var addGenotypeRegions: List[File] = List[File]() + + @Argument(shortName = "longJobQueue", doc = "Job queue to run the 'long-running' commands", required = false) + var longJobQueue: String = "" + + @Argument(shortName = "filteredZscored", doc = "File of PCA-normalized read depths, after filtering and Z-score calculation", required = true) + var filteredZscored: File = _ + + @Argument(shortName = "originalSameFiltered", doc = "File of original read depths, using same filters (samples and targets) as Z-score matrix [filteredZscored argument]", required = true) + var originalSameFiltered: File = _ + + + trait LongRunTime extends CommandLineFunction { + if (longJobQueue != "") + this.jobQueue = longJobQueue + } + + def script = { + val parseMixedInputBamList = parseBamListWithOptionalSampleMappings(bams) + + val processMixedInputBamList = new ProcessBamListWithOptionalSampleMappings(parseMixedInputBamList, outputBase.getPath) + add(processMixedInputBamList) + + val samples: List[String] = parseMixedInputBamList.sampleToBams.keys.toList + Console.out.printf("Samples are %s%n", samples) + + val groups: List[Group] = buildDoCgroups(samples, parseMixedInputBamList.sampleToBams, samplesPerJob, outputBase) + for (group <- groups) { + Console.out.printf("Group is %s%n", group) + } + + for (regionsFile <- addGenotypeRegions) { + val genotypeRegions = new GenotypeCNVs(filteredZscored, regionsFile, originalSameFiltered, new File(outputBase.getParent + "/" + regionsFile.getName.replace(".interval_list", "") + "." + outputBase.getName), xhmmParamsArg, referenceFile, genotypeCommandLineParams, xhmmExec, groups) with LongRunTime + add(genotypeRegions) + } + } + +} diff --git a/public/gatk-queue-extensions-public/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/CNV/xhmmCNVpipeline.scala b/public/gatk-queue-extensions-public/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/CNV/xhmmCNVpipeline.scala index 7d2566865..d031f5d4d 100644 --- a/public/gatk-queue-extensions-public/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/CNV/xhmmCNVpipeline.scala +++ b/public/gatk-queue-extensions-public/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/CNV/xhmmCNVpipeline.scala @@ -134,6 +134,9 @@ class xhmmCNVpipeline extends QScript { @Argument(shortName = "subsegmentGenotypeThreshold", doc = "If genotypeSubsegments, this is the default genotype quality threshold for the sub-segments", required = false) var subsegmentGenotypeThreshold = 20.0 + @Argument(shortName = "addGenotypeRegions", doc = "Additional interval list files to be genotyped", required = false) + var addGenotypeRegions: List[File] = List[File]() + @Argument(shortName = "longJobQueue", doc = "Job queue to run the 'long-running' commands", required = false) var longJobQueue: String = "" @@ -314,41 +317,7 @@ class xhmmCNVpipeline extends QScript { val discover = new DiscoverCNVs(filterZscore.filteredZscored, filterOriginal.sameFiltered) add(discover) - - abstract class BaseGenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File, outName: String) extends SamplesScatterable(xhmmExec, groups) with LongRunTime { - @Input(doc = "") - val input = inputParam - - @Input(doc = "") - val xhmmParams = xhmmParamsArg - - @Input(doc = "") - val origRD = origRDParam - - @Input(doc = "") - val inXcnv = xcnv - - @Output - @Gather(classOf[MergeVCFsGatherFunction]) - val vcf: File = new File(outName) - - override def commandLine = - xhmmExec + " --genotype" + - " -p " + xhmmParams + - " -r " + input + - " -g " + inXcnv + - " -F " + referenceFile + - " -R " + origRD + - " -v " + vcf + - " " + genotypeCommandLineParams + - " " + addCommand - } - - class GenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam, outputBase.getPath + ".vcf") { - override def description = "Genotypes discovered CNVs in all samples: " + commandLine - } - - class GenotypeCNVandSubsegments(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam, outputBase.getPath + ".subsegments.vcf") { + class GenotypeCNVandSubsegments(inputParam: File, xcnv: File, origRDParam: File, xhmmParamsArg: File, referenceFile: File, genotypeCommandLineParams: String, xhmmExec: File, groups: List[Group]) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam, outputBase.getPath + ".subsegments.vcf", xhmmParamsArg, referenceFile, genotypeCommandLineParams, xhmmExec, groups) { override def commandLine = super.commandLine + " --subsegments" + @@ -358,13 +327,19 @@ class xhmmCNVpipeline extends QScript { override def description = "Genotypes discovered CNVs (and their sub-segments, of up to " + maxTargetsInSubsegment + " targets) in all samples: " + commandLine } - val genotype = new GenotypeCNVs(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered) + val genotype = new GenotypeCNVs(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered, outputBase, xhmmParamsArg, referenceFile, genotypeCommandLineParams, xhmmExec, groups) with LongRunTime add(genotype) if (genotypeSubsegments) { - val genotypeSegs = new GenotypeCNVandSubsegments(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered) + val genotypeSegs = new GenotypeCNVandSubsegments(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered, xhmmParamsArg, referenceFile, genotypeCommandLineParams, xhmmExec, groups) with LongRunTime add(genotypeSegs) } + + addGenotypeRegions :+= prepTargets.out + for (regionsFile <- addGenotypeRegions) { + val genotypeRegions = new GenotypeCNVs(filterZscore.filteredZscored, regionsFile, filterOriginal.sameFiltered, new File(outputBase.getParent + "/" + regionsFile.getName.replace(".interval_list", "") + "." + outputBase.getName), xhmmParamsArg, referenceFile, genotypeCommandLineParams, xhmmExec, groups) with LongRunTime + add(genotypeRegions) + } } class ExcludeTargetsBasedOnValue(locus_valueIn : File, outSuffix : String, minVal : Double, maxVal : Double) extends InProcessFunction { @@ -531,85 +506,3 @@ class xhmmCNVpipeline extends QScript { override def description = "Filters original read-depth data to be the same as filtered, normalized data: " + command } } - - -abstract class SamplesScatterable(val xhmmExec: File, val groups: List[Group]) extends ScatterGatherableFunction with CommandLineFunction { - this.scatterCount = groups.size - this.scatterClass = classOf[SamplesScatterFunction] - - @Input(doc = "", required=false) - var keepSampleIDs: Option[String] = None - - def addCommand = if (keepSampleIDs.isDefined) ("--keepSampleIDs " + keepSampleIDs.get) else "" -} - -class SamplesScatterFunction extends ScatterFunction with InProcessFunction { - protected var groups: List[Group] = _ - override def scatterCount = groups.size - - @Output(doc="Scatter function outputs") - var scatterSamples: Seq[File] = Nil - - override def init() { - this.groups = this.originalFunction.asInstanceOf[SamplesScatterable].groups - } - - override def bindCloneInputs(cloneFunction: CloneFunction, index: Int) { - val scatterPart = IOUtils.absolute(cloneFunction.commandDirectory, "keepSampleIDs.txt") - cloneFunction.setFieldValue("keepSampleIDs", Some(scatterPart)) - this.scatterSamples :+= scatterPart - } - - override def run() { - if (groups.size != this.scatterSamples.size) - throw new Exception("Internal inconsistency error in scattering jobs") - - (groups, this.scatterSamples).zipped foreach { - (group, sampsFile) => { - val sampsWriter = new PrintWriter(new PrintStream(sampsFile)) - - for (samp <- group.samples) { - try { - sampsWriter.printf("%s%n", samp) - } - catch { - case e: Exception => throw e - } - } - sampsWriter.close - } - } - } -} - -trait MergeVCFs extends CommandLineFunction { - var xhmmExec: File = _ - - @Input(doc = "") - var inputVCFs: List[File] = Nil - - @Output - var mergedVCF: File = null - - override def commandLine = - xhmmExec + " --mergeVCFs" + - inputVCFs.map(input => " --mergeVCF " + input).reduceLeft(_ + "" + _) + - " -v " + mergedVCF - - override def description = "Combines VCF outputs for multiple samples (at same loci): " + commandLine -} - -class MergeVCFsGatherFunction extends MergeVCFs with GatherFunction { - override def freezeFieldValues() { - super.freezeFieldValues() - - this.xhmmExec = originalFunction.asInstanceOf[SamplesScatterable].xhmmExec - - this.inputVCFs = this.gatherParts.toList - this.mergedVCF = this.originalOutput - } -} - -class DummyGatherFunction extends InProcessFunction with GatherFunction { - override def run() {} -} diff --git a/public/gatk-queue-extensions-public/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/GATKResourcesBundle.scala b/public/gatk-queue-extensions-public/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/GATKResourcesBundle.scala index d89e6058a..ccc0cb56a 100644 --- a/public/gatk-queue-extensions-public/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/GATKResourcesBundle.scala +++ b/public/gatk-queue-extensions-public/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/GATKResourcesBundle.scala @@ -167,6 +167,9 @@ class GATKResourcesBundle extends QScript { addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/GoldStandardIndel/gold.standard.indel.MillsAnd1000G.b37.vcf", "Mills_and_1000G_gold_standard.indels", b37, true, false)) + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Affymetrix_Axiom/Axiom_Exome_Plus.sites_only.all_populations.poly.vcf", + "Axiom_Exome_Plus.sites_only.all_populations.poly", b37, true, false)) + // // CEU trio (NA12878,NA12891,NA12892) best practices results // diff --git a/public/gatk-queue-extensions-public/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/examples/ExamplePrintReads.scala b/public/gatk-queue-extensions-public/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/examples/ExamplePrintReads.scala index 440b6f288..f3c4bf382 100644 --- a/public/gatk-queue-extensions-public/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/examples/ExamplePrintReads.scala +++ b/public/gatk-queue-extensions-public/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/examples/ExamplePrintReads.scala @@ -41,6 +41,9 @@ class ExamplePrintReads extends QScript { @Output(doc="Bam output", shortName="out") var outFile: File = _ + @Argument(doc="One or more genomic intervals over which to operate", shortName="L", required=false) + var intervals: Seq[String] = Nil + def script() { val printReads = new PrintReads printReads.reference_sequence = referenceFile @@ -48,6 +51,7 @@ class ExamplePrintReads extends QScript { printReads.scatterCount = 3 printReads.input_file :+= bamFile printReads.out = outFile + printReads.intervalsString = intervals add(printReads) } } diff --git a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/cancer/MuTect.scala b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/cancer/MuTect.scala index 36031d948..f116af51a 100644 --- a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/cancer/MuTect.scala +++ b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/cancer/MuTect.scala @@ -32,6 +32,7 @@ import org.broadinstitute.gatk.utils.commandline.Input import org.broadinstitute.gatk.utils.commandline.Output import org.broadinstitute.gatk.queue.function.scattergather.ScatterGatherableFunction import org.broadinstitute.gatk.queue.extensions.gatk.{TaggedFile, VcfGatherFunction, LocusScatterFunction} +import org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor.isCompressed class MuTect extends org.broadinstitute.gatk.queue.extensions.gatk.CommandLineGATK with ScatterGatherableFunction { analysisName = "MuTect" @@ -409,7 +410,7 @@ class MuTect extends org.broadinstitute.gatk.queue.extensions.gatk.CommandLineGA override def freezeFieldValues() { super.freezeFieldValues() if (vcf != null && !org.broadinstitute.gatk.utils.io.IOUtils.isSpecialFile(vcf)) - if (!org.broadinstitute.gatk.engine.io.stubs.VCFWriterArgumentTypeDescriptor.isCompressed(vcf.getPath)) + if (!org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor.isCompressed(vcf.getPath)) vcfIndex = new File(vcf.getPath + ".idx") dbsnpIndexes ++= dbsnp.filter(orig => orig != null).map(orig => new File(orig.getPath + ".idx")) cosmicIndexes ++= cosmic.filter(orig => orig != null).map(orig => new File(orig.getPath + ".idx")) diff --git a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/ContigScatterFunction.scala b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/ContigScatterFunction.scala index 7eff64518..e1da454f5 100644 --- a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/ContigScatterFunction.scala +++ b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/ContigScatterFunction.scala @@ -33,8 +33,6 @@ import org.broadinstitute.gatk.queue.function.InProcessFunction * Splits intervals by contig instead of evenly. */ class ContigScatterFunction extends GATKScatterFunction with InProcessFunction { - // Include unmapped reads by default. - this.includeUnmapped = true override def scatterCount = if (intervalFilesExist) super.scatterCount min this.maxIntervals else super.scatterCount diff --git a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/DoC/package.scala b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/DoC/package.scala index fd54be631..fc999d04a 100644 --- a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/DoC/package.scala +++ b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/DoC/package.scala @@ -27,7 +27,6 @@ package org.broadinstitute.gatk.queue.extensions.gatk import java.io.{PrintStream, PrintWriter, File} import org.broadinstitute.gatk.queue.function.scattergather.ScatterGatherableFunction -import org.broadinstitute.gatk.engine.downsampling.DownsampleType import org.broadinstitute.gatk.utils.commandline.{Input, Gather, Output} import org.broadinstitute.gatk.queue.function.{InProcessFunction, CommandLineFunction} import org.broadinstitute.gatk.tools.walkers.coverage.CoverageUtils @@ -35,11 +34,13 @@ import scala.collection.JavaConversions._ import scala.Some import org.broadinstitute.gatk.utils.text.XReadLines import org.broadinstitute.gatk.queue.util.VCF_BAM_utilities +import org.broadinstitute.gatk.utils.downsampling.DownsampleType // Minimal refactor from a package object to a file full of classes/objects // due to ongoing bugs with inner classes/objects in package objects: // https://issues.scala-lang.org/browse/SI-4344 // https://issues.scala-lang.org/browse/SI-5954 + class DoC(val bams: List[File], val DoC_output: File, val countType: CoverageUtils.CountPileupType, val MAX_DEPTH: Int, val minMappingQuality: Int, val minBaseQuality: Int, val scatterCountInput: Int, val START_BIN: Int, val NUM_BINS: Int, val minCoverageCalcs: Seq[Int], val sampleRenameMappingFile: Option[File] = None) extends CommandLineGATK with ScatterGatherableFunction { val DOC_OUTPUT_SUFFIX: String = ".sample_interval_summary" diff --git a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/GATKIntervals.scala b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/GATKIntervals.scala index 0e568b61f..c1d71e281 100644 --- a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/GATKIntervals.scala +++ b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/GATKIntervals.scala @@ -59,7 +59,7 @@ case class GATKIntervals(reference: File, intervals: Seq[File], intervalsString: this.excludeIntervalsString.map(GATKIntervals.createBinding(_, "excludeIntervalsString")) IntervalUtils.parseIntervalBindings( - referenceDataSource, + referenceDataSource.getReference, includeIntervalBindings, intervalSetRule, intervalMergingRule, intervalPadding.getOrElse(0), excludeIntervalBindings).toList diff --git a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/GATKScatterFunction.scala b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/GATKScatterFunction.scala index 01075c393..12fea171b 100644 --- a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/GATKScatterFunction.scala +++ b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/GATKScatterFunction.scala @@ -48,7 +48,7 @@ trait GATKScatterFunction extends ScatterFunction { protected var originalGATK: CommandLineGATK = _ /** Whether the last scatter job should also include any unmapped reads. */ - protected var includeUnmapped: Boolean = _ + var includeUnmapped: Boolean = _ override def init() { this.originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK] diff --git a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/XHMM/package.scala b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/XHMM/package.scala new file mode 100644 index 000000000..36fcdc74d --- /dev/null +++ b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/XHMM/package.scala @@ -0,0 +1,156 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.queue.extensions.gatk + +import org.broadinstitute.gatk.queue.extensions.gatk._ +import org.broadinstitute.gatk.queue.QScript +import org.broadinstitute.gatk.queue.extensions.gatk.DoC._ +import org.broadinstitute.gatk.utils.commandline._ +import java.io.{File, PrintStream, PrintWriter} +import collection.JavaConversions._ +import org.broadinstitute.gatk.queue.function.scattergather.{CloneFunction, ScatterFunction, GatherFunction, ScatterGatherableFunction} +import org.broadinstitute.gatk.queue.function.{CommandLineFunction, InProcessFunction} +import org.broadinstitute.gatk.utils.io.IOUtils + +// Minimal refactor from a package object to a file full of classes/objects +// due to ongoing bugs with inner classes/objects in package objects: +// https://issues.scala-lang.org/browse/SI-4344 +// https://issues.scala-lang.org/browse/SI-5954 + + abstract class BaseGenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File, outName: String, xhmmParamsArg: File, referenceFile: File, genotypeCommandLineParams: String, xhmmExec: File, groups: List[Group]) extends SamplesScatterable(xhmmExec, groups) { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val xhmmParams = xhmmParamsArg + + @Input(doc = "") + val origRD = origRDParam + + @Input(doc = "") + val inXcnv = xcnv + + @Output + @Gather(classOf[MergeVCFsGatherFunction]) + val vcf: File = new File(outName) + + override def commandLine = + xhmmExec + " --genotype" + + " -p " + xhmmParams + + " -r " + input + + " -g " + inXcnv + + " -F " + referenceFile + + " -R " + origRD + + " -v " + vcf + + " " + genotypeCommandLineParams + + " " + addCommand + } + + class GenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File, genotypeOutputBase: File, xhmmParamsArg: File, referenceFile: File, genotypeCommandLineParams: String, xhmmExec: File, groups: List[Group]) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam, genotypeOutputBase.getPath + ".vcf", xhmmParamsArg, referenceFile, genotypeCommandLineParams, xhmmExec, groups) { + override def description = "Genotypes CNV regions in all samples: " + commandLine + } + + +abstract class SamplesScatterable(val xhmmExec: File, val groups: List[Group]) extends ScatterGatherableFunction with CommandLineFunction { + this.scatterCount = groups.size + this.scatterClass = classOf[SamplesScatterFunction] + + @Input(doc = "", required=false) + var keepSampleIDs: Option[String] = None + + def addCommand = if (keepSampleIDs.isDefined) ("--keepSampleIDs " + keepSampleIDs.get) else "" +} + +class SamplesScatterFunction extends ScatterFunction with InProcessFunction { + protected var groups: List[Group] = _ + override def scatterCount = groups.size + + @Output(doc="Scatter function outputs") + var scatterSamples: Seq[File] = Nil + + override def init() { + this.groups = this.originalFunction.asInstanceOf[SamplesScatterable].groups + } + + override def bindCloneInputs(cloneFunction: CloneFunction, index: Int) { + val scatterPart = IOUtils.absolute(cloneFunction.commandDirectory, "keepSampleIDs.txt") + cloneFunction.setFieldValue("keepSampleIDs", Some(scatterPart)) + this.scatterSamples :+= scatterPart + } + + override def run() { + if (groups.size != this.scatterSamples.size) + throw new Exception("Internal inconsistency error in scattering jobs") + + (groups, this.scatterSamples).zipped foreach { + (group, sampsFile) => { + val sampsWriter = new PrintWriter(new PrintStream(sampsFile)) + + for (samp <- group.samples) { + try { + sampsWriter.printf("%s%n", samp) + } + catch { + case e: Exception => throw e + } + } + sampsWriter.close + } + } + } +} + +trait MergeVCFs extends CommandLineFunction { + var xhmmExec: File = _ + + @Input(doc = "") + var inputVCFs: List[File] = Nil + + @Output + var mergedVCF: File = null + + override def commandLine = + xhmmExec + " --mergeVCFs" + + inputVCFs.map(input => " --mergeVCF " + input).reduceLeft(_ + "" + _) + + " -v " + mergedVCF + + override def description = "Combines VCF outputs for multiple samples (at same loci): " + commandLine +} + +class MergeVCFsGatherFunction extends MergeVCFs with GatherFunction { + override def freezeFieldValues() { + super.freezeFieldValues() + + this.xhmmExec = originalFunction.asInstanceOf[SamplesScatterable].xhmmExec + + this.inputVCFs = this.gatherParts.toList + this.mergedVCF = this.originalOutput + } +} + +class DummyGatherFunction extends InProcessFunction with GatherFunction { + override def run() {} +} diff --git a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/picard/MarkDuplicates.scala b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/picard/MarkDuplicates.scala index 66460b688..fe7739093 100644 --- a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/picard/MarkDuplicates.scala +++ b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/picard/MarkDuplicates.scala @@ -37,7 +37,7 @@ import java.io.File */ class MarkDuplicates extends org.broadinstitute.gatk.queue.function.JavaCommandLineFunction with PicardBamFunction { analysisName = "MarkDuplicates" - javaMainClass = "picard.sam.MarkDuplicates" + javaMainClass = "picard.sam.markduplicates.MarkDuplicates" @Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) var input: Seq[File] = Nil @@ -55,7 +55,7 @@ class MarkDuplicates extends org.broadinstitute.gatk.queue.function.JavaCommandL var REMOVE_DUPLICATES: Boolean = false @Argument(doc = "Maximum number of file handles to keep open when spilling read ends to disk. Set this number a little lower than the per-process maximum number of file that may be open. This number can be found by executing the 'ulimit -n' command on a Unix system.", shortName = "max_file_handles", fullName ="max_file_handles_for_read_ends_maps", required=false) - var MAX_FILE_HANDLES_FOR_READ_ENDS_MAP: Int = -1; + var MAX_FILE_HANDLES_FOR_READ_ENDS_MAP: Int = -1 @Argument(doc = "This number, plus the maximum RAM available to the JVM, determine the memory footprint used by some of the sorting collections. If you are running out of memory, try reducing this number.", shortName = "sorting_ratio", fullName = "sorting_collection_size_ratio", required = false) var SORTING_COLLECTION_SIZE_RATIO: Double = -1 diff --git a/public/gatk-queue-extensions-public/src/test/scala/org/broadinstitute/gatk/queue/extensions/gatk/QueueFeaturesQueueTest.scala b/public/gatk-queue-extensions-public/src/test/scala/org/broadinstitute/gatk/queue/extensions/gatk/QueueFeaturesQueueTest.scala new file mode 100644 index 000000000..f1db69e1c --- /dev/null +++ b/public/gatk-queue-extensions-public/src/test/scala/org/broadinstitute/gatk/queue/extensions/gatk/QueueFeaturesQueueTest.scala @@ -0,0 +1,63 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.queue.extensions.gatk + +import org.broadinstitute.gatk.queue.pipeline.{QueueTest, QueueTestSpec} +import org.broadinstitute.gatk.utils.BaseTest +import org.testng.annotations.Test + +class QueueFeaturesQueueTest { + + @Test(timeOut=36000000) + def testIncludeUnmapped(): Unit = { + + //First case: When no intervals are specified, unmapped reads should be included + val testOut = "withunmapped.bam" + val spec = new QueueTestSpec + spec.name = "includeUnmapped" + spec.args = Array( + " -S " + QueueTest.publicQScriptsPackageDir + "examples/ExamplePrintReads.scala", + " -R " + BaseTest.publicTestDir + "exampleFASTA.fasta", + " -I " + BaseTest.publicTestDir + "exampleBAM_with_unmapped.bam", + " -out " + testOut).mkString + spec.fileMD5s += testOut -> "c7f086293509b1c506f7a25b13754637" + QueueTest.executeTest(spec) + + //Second case: When intervals are explicitly provided, unmapped reads should not be included + val testOut2 = "withoutunmapped.bam" + val spec2 = new QueueTestSpec + spec2.name = "excludeUnmapped" + spec2.args = Array( + " -S " + QueueTest.publicQScriptsPackageDir + "examples/ExamplePrintReads.scala", + " -R " + BaseTest.publicTestDir + "exampleFASTA.fasta", + " -I " + BaseTest.publicTestDir + "exampleBAM_with_unmapped.bam", + " -L chr1", + " -out " + testOut2).mkString + spec2.fileMD5s += testOut2 -> "44bda07e3421a79c56213900ad3f7d7c" + QueueTest.executeTest(spec2) + } + +} diff --git a/public/gatk-queue-extensions-public/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/HelloWorldQueueTest.scala b/public/gatk-queue-extensions-public/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/HelloWorldQueueTest.scala index 093050aa9..cef77f961 100644 --- a/public/gatk-queue-extensions-public/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/HelloWorldQueueTest.scala +++ b/public/gatk-queue-extensions-public/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/HelloWorldQueueTest.scala @@ -68,7 +68,7 @@ class HelloWorldQueueTest { QueueTest.executeTest(spec) } - @Test(timeOut=36000000) + @Test(enabled=false, timeOut=36000000) def testHelloWorldWithLsfResource() { val spec = new QueueTestSpec spec.name = "HelloWorldWithLsfResource" @@ -78,7 +78,7 @@ class HelloWorldQueueTest { QueueTest.executeTest(spec) } - @Test(timeOut=36000000) + @Test(enabled=false, timeOut=36000000) def testHelloWorldWithLsfResourceAndMemoryLimit() { val spec = new QueueTestSpec spec.name = "HelloWorldWithLsfResourceAndMemoryLimit" @@ -88,7 +88,7 @@ class HelloWorldQueueTest { QueueTest.executeTest(spec) } - @Test(timeOut=36000000) + @Test(enabled=false, timeOut=36000000) def testHelloWorldWithLsfEnvironment() { val spec = new QueueTestSpec spec.name = "HelloWorldWithLsfEnvironment" diff --git a/public/gatk-queue/pom.xml b/public/gatk-queue/pom.xml index 05ce20708..525f0b59a 100644 --- a/public/gatk-queue/pom.xml +++ b/public/gatk-queue/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.3 + 3.4-SNAPSHOT ../.. @@ -21,7 +21,7 @@ ${project.groupId} - gatk-tools-public + gatk-utils ${project.version} @@ -47,7 +47,7 @@ ${project.groupId} - gatk-tools-public + gatk-utils ${project.version} test-jar test diff --git a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/QCommandLine.scala b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/QCommandLine.scala index 297e10bb3..843743e48 100644 --- a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/QCommandLine.scala +++ b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/QCommandLine.scala @@ -34,8 +34,7 @@ import org.broadinstitute.gatk.utils.classloader.PluginManager import org.broadinstitute.gatk.utils.exceptions.UserException import org.broadinstitute.gatk.utils.io.IOUtils import org.broadinstitute.gatk.utils.help.ApplicationDetails -import java.util.{ResourceBundle, Arrays} -import org.broadinstitute.gatk.utils.text.TextFormattingUtils +import java.util.Arrays import org.apache.commons.io.FilenameUtils /** @@ -260,33 +259,11 @@ class QCommandLine extends CommandLineProgram with Logging { } private def createQueueHeader() : Seq[String] = { - Seq(String.format("Queue v%s, Compiled %s", getQueueVersion, getBuildTimestamp), + Seq(String.format("Queue v%s, Compiled %s", CommandLineProgram.getVersionNumber, CommandLineProgram.getBuildTime), "Copyright (c) 2012 The Broad Institute", "For support and documentation go to http://www.broadinstitute.org/gatk") } - private def getQueueVersion : String = { - val stingResources : ResourceBundle = TextFormattingUtils.loadResourceBundle("GATKText") - - if ( stingResources.containsKey("org.broadinstitute.gatk.queue.QueueVersion.version") ) { - stingResources.getString("org.broadinstitute.gatk.queue.QueueVersion.version") - } - else { - "" - } - } - - private def getBuildTimestamp : String = { - val stingResources : ResourceBundle = TextFormattingUtils.loadResourceBundle("GATKText") - - if ( stingResources.containsKey("build.timestamp") ) { - stingResources.getString("build.timestamp") - } - else { - "" - } - } - def shutdown() = { shuttingDown = true qGraph.shutdown() diff --git a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/QSettings.scala b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/QSettings.scala index 86457fb49..7574518ef 100644 --- a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/QSettings.scala +++ b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/QSettings.scala @@ -98,4 +98,9 @@ class QSettings { @Argument(fullName="log_directory", shortName="logDir", doc="Directory to write log files into.", required=false) var logDirectory: File = _ + + /** + * If set, use Broad-specific cluster settings in the GridEngine job runner. Activated via the -qsub-broad argument in QGraphSettings. + */ + var useBroadClusterSettings: Boolean = false } diff --git a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/engine/QGraph.scala b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/engine/QGraph.scala index 7d09bf561..31cdef904 100644 --- a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/engine/QGraph.scala +++ b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/engine/QGraph.scala @@ -395,12 +395,18 @@ class QGraph extends Logging { */ private def runJobs() { try { - if (settings.bsub) + if (settings.bsub) { settings.jobRunner = "Lsf706" - else if (settings.qsub) + } + else if (settings.qsub || settings.qsubBroad) { settings.jobRunner = "GridEngine" - else if (settings.jobRunner == null) + if ( settings.qsubBroad ) { + settings.qSettings.useBroadClusterSettings = true + } + } + else if (settings.jobRunner == null) { settings.jobRunner = "Shell" + } commandLineManager = commandLinePluginManager.createByName(settings.jobRunner) for (mgr <- managers) { diff --git a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/engine/QGraphSettings.scala b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/engine/QGraphSettings.scala index 49dace949..7f5177101 100644 --- a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/engine/QGraphSettings.scala +++ b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/engine/QGraphSettings.scala @@ -46,6 +46,9 @@ class QGraphSettings { @Argument(fullName="qsub", shortName="qsub", doc="Equivalent to -jobRunner GridEngine", required=false) var qsub = false + @Argument(fullName="qsub-broad", shortName="qsub-broad", doc="Equivalent to -qsub, but uses GridEngine parameters specific to the Broad GridEngine cluster", required=false) + var qsubBroad = false + @Argument(fullName="status",shortName="status",doc="Get status of jobs for the qscript",required=false) var getStatus = false diff --git a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/engine/gridengine/GridEngineJobRunner.scala b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/engine/gridengine/GridEngineJobRunner.scala index b21f43b17..8ddda6c97 100644 --- a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/engine/gridengine/GridEngineJobRunner.scala +++ b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/engine/gridengine/GridEngineJobRunner.scala @@ -58,8 +58,14 @@ class GridEngineJobRunner(session: Session, function: CommandLineFunction) exten nativeSpec += " -l %s=%dM".format(function.qSettings.residentRequestParameter, function.residentRequest.map(_ * 1024).get.ceil.toInt) // If the resident set size limit is defined specify the memory limit - if (function.residentLimit.isDefined) - nativeSpec += " -l h_rss=%dM".format(function.residentLimit.map(_ * 1024).get.ceil.toInt) + if (function.residentLimit.isDefined) { + var memoryLimitParameter : String = "h_rss" + if (function.qSettings.useBroadClusterSettings) { + memoryLimitParameter = "h_vmem" + } + + nativeSpec += " -l %s=%dM".format(memoryLimitParameter, function.residentLimit.map(_ * 1024).get.ceil.toInt) + } // If more than 1 core is requested, set the proper request // if we aren't being jerks and just stealing cores (previous behavior) @@ -82,7 +88,7 @@ class GridEngineJobRunner(session: Session, function: CommandLineFunction) exten if (priority.isDefined) nativeSpec += " -p " + priority.get - logger.debug("Native spec is: %s".format(nativeSpec)) + logger.info("Native spec is: %s".format(nativeSpec)) (nativeSpec + " " + super.functionNativeSpec).trim() } } diff --git a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QJobReport.scala b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QJobReport.scala index be5a17f43..ddc11eb34 100644 --- a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QJobReport.scala +++ b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QJobReport.scala @@ -26,8 +26,8 @@ package org.broadinstitute.gatk.queue.util import org.broadinstitute.gatk.queue.function.QFunction -import org.broadinstitute.gatk.engine.report.GATKReportTable import org.broadinstitute.gatk.queue.engine.JobRunInfo +import org.broadinstitute.gatk.utils.report.GATKReportTable /** * A mixin to add Job info to the class diff --git a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QJobsReporter.scala b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QJobsReporter.scala index b3b0b33c8..082062364 100644 --- a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QJobsReporter.scala +++ b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QJobsReporter.scala @@ -30,9 +30,10 @@ import org.broadinstitute.gatk.utils.io.{Resource} import org.broadinstitute.gatk.queue.engine.{JobRunInfo, QGraph} import org.broadinstitute.gatk.queue.function.QFunction import org.broadinstitute.gatk.utils.R.{RScriptLibrary, RScriptExecutor} -import org.broadinstitute.gatk.engine.report.{GATKReportTable, GATKReport} +import org.broadinstitute.gatk.utils.report.GATKReportTable import org.broadinstitute.gatk.utils.exceptions.UserException import org.apache.commons.io.{FileUtils, IOUtils} +import org.broadinstitute.gatk.utils.report.{GATKReportTable, GATKReport} /** * Writes out RunInfo to a GATKReport diff --git a/public/gatk-queue/src/test/scala/org/broadinstitute/gatk/queue/pipeline/QueueTest.scala b/public/gatk-queue/src/test/scala/org/broadinstitute/gatk/queue/pipeline/QueueTest.scala index 62ac8e1fe..024fcb6f1 100644 --- a/public/gatk-queue/src/test/scala/org/broadinstitute/gatk/queue/pipeline/QueueTest.scala +++ b/public/gatk-queue/src/test/scala/org/broadinstitute/gatk/queue/pipeline/QueueTest.scala @@ -35,9 +35,9 @@ import org.broadinstitute.gatk.utils.MD5DB import org.broadinstitute.gatk.queue.{QScript, QCommandLine} import org.broadinstitute.gatk.queue.util.Logging import java.io.{FilenameFilter, File} -import org.broadinstitute.gatk.engine.report.GATKReport import org.apache.commons.io.FileUtils import org.apache.commons.io.filefilter.WildcardFileFilter +import org.broadinstitute.gatk.utils.report.GATKReport object QueueTest extends BaseTest with Logging { @@ -53,12 +53,12 @@ object QueueTest extends BaseTest with Logging { /** * All the job runners configured to run QueueTests at The Broad. */ - final val allJobRunners = Seq("Lsf706", "GridEngine", "Shell") + final val allJobRunners = Seq("GridEngine", "Shell") /** * The default job runners to run. */ - final val defaultJobRunners = Seq("Lsf706", "GridEngine") + final val defaultJobRunners = Seq("GridEngine") /** * Returns the top level output path to this test. diff --git a/public/gatk-root/pom.xml b/public/gatk-root/pom.xml index c9f251761..2f3a01b44 100644 --- a/public/gatk-root/pom.xml +++ b/public/gatk-root/pom.xml @@ -12,7 +12,7 @@ org.broadinstitute.gatk gatk-root - 3.3 + 3.4-SNAPSHOT pom GATK Root @@ -44,8 +44,8 @@ org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.gatk.utils.TestNGTestTransformer,org.broadinstitute.gatk.utils.GATKTextReporter,org.uncommons.reportng.HTMLReporter - 1.120.1620 - 1.120.1579 + 1.132 + 1.131 @@ -67,7 +67,7 @@ 1.0-r139 - samtools + com.github.samtools htsjdk ${htsjdk.version} @@ -82,7 +82,7 @@ - picard + com.github.broadinstitute picard ${picard.version} @@ -130,10 +130,16 @@ reflections 0.9.9-RC1 + + + org.slf4j + slf4j-api + 1.6.1 + org.slf4j slf4j-log4j12 - 1.6.1 + 1.7.5 gov.nist.math @@ -404,6 +410,7 @@ ${gatk.basedir} diff true + ${gatk.queuetests.run} ${java.io.tmpdir} @@ -458,8 +465,7 @@ ${gatk.basedir} diff true - - ${gatk.queuetests.run} + ${gatk.queuetests.run} ${java.io.tmpdir} diff --git a/public/gatk-tools-public/pom.xml b/public/gatk-tools-public/pom.xml index 5386b3241..cbf26bb41 100644 --- a/public/gatk-tools-public/pom.xml +++ b/public/gatk-tools-public/pom.xml @@ -5,7 +5,7 @@ org.broadinstitute.gatk gatk-aggregator - 3.3 + 3.4-SNAPSHOT ../.. @@ -24,6 +24,26 @@ gatk-engine ${project.version} + + org.apache.commons + commons-jexl + + + + ${project.groupId} + gatk-utils + ${project.version} + test-jar + test + + + + ${project.groupId} + gatk-engine + ${project.version} + test-jar + test + com.google.caliper @@ -34,16 +54,6 @@ - - org.apache.maven.plugins - maven-resources-plugin - - - copy-resource-bundle-log4j - prepare-package - - - org.apache.maven.plugins maven-javadoc-plugin diff --git a/public/gatk-tools-public/src/main/java/htsjdk/samtools/GATKBin.java b/public/gatk-tools-public/src/main/java/htsjdk/samtools/GATKBin.java deleted file mode 100644 index d1e689d63..000000000 --- a/public/gatk-tools-public/src/main/java/htsjdk/samtools/GATKBin.java +++ /dev/null @@ -1,135 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package htsjdk.samtools; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -/** - * A temporary solution to work around Java access rights issues: - * override GATKBin and make it public. - * TODO: Eliminate once we determine the final fate of the BAM index reading code. - */ -public class GATKBin implements Comparable { - /** - * The reference sequence associated with this bin. - */ - private final int referenceSequence; - - /** - * The number of this bin within the BAM file. - */ - private final int binNumber; - - /** - * The chunks associated with this bin. - */ - private GATKChunk[] chunkList; - - public GATKBin(Bin bin) { - this(bin.getReferenceSequence(),bin.getBinNumber()); - } - - public GATKBin(final int referenceSequence, final int binNumber) { - this.referenceSequence = referenceSequence; - this.binNumber = binNumber; - } - - public int getReferenceSequence() { - return referenceSequence; - } - - public int getBinNumber() { - return binNumber; - } - - /** - * Convert this GATKBin to a normal bin, for processing with the standard BAM query interface. - * @return - */ - public Bin toBin() { - return new Bin(referenceSequence,binNumber); - } - - /** - * See whether two bins are equal. If the ref seq and the bin number - * are equal, assume equality of the chunk list. - * @param other The other Bin to which to compare this. - * @return True if the two bins are equal. False otherwise. - */ - @Override - public boolean equals(Object other) { - if(other == null) return false; - if(!(other instanceof GATKBin)) return false; - - GATKBin otherBin = (GATKBin)other; - return this.referenceSequence == otherBin.referenceSequence && this.binNumber == otherBin.binNumber; - } - - /** - * Compute a unique hash code for the given reference sequence and bin number. - * @return A unique hash code. - */ - @Override - public int hashCode() { - return ((Integer)referenceSequence).hashCode() ^ ((Integer)binNumber).hashCode(); - } - - /** - * Compare two bins to see what ordering they should appear in. - * @param other Other bin to which this bin should be compared. - * @return -1 if this < other, 0 if this == other, 1 if this > other. - */ - public int compareTo(GATKBin other) { - if(other == null) - throw new ClassCastException("Cannot compare to a null object"); - - // Check the reference sequences first. - if(this.referenceSequence != other.referenceSequence) - return referenceSequence - other.referenceSequence; - - // Then check the bin ordering. - return binNumber - other.binNumber; - } - - /** - * Sets the chunks associated with this bin - */ - public void setChunkList(GATKChunk[] list){ - chunkList = list; - } - - /** - * Gets the list of chunks associated with this bin. - * @return the chunks in this bin. If no chunks are associated, an empty list will be returned. - */ - public GATKChunk[] getChunkList(){ - if(chunkList == null) - return new GATKChunk[0]; - return chunkList; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java deleted file mode 100644 index b8221bb16..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java +++ /dev/null @@ -1,229 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine; - -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor; -import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; -import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; -import org.broadinstitute.gatk.engine.datasources.reads.SAMReaderID; -import org.broadinstitute.gatk.engine.filters.ReadFilter; -import org.broadinstitute.gatk.engine.io.stubs.OutputStreamArgumentTypeDescriptor; -import org.broadinstitute.gatk.engine.io.stubs.SAMFileWriterArgumentTypeDescriptor; -import org.broadinstitute.gatk.engine.io.stubs.VCFWriterArgumentTypeDescriptor; -import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.crypt.CryptUtils; -import org.broadinstitute.gatk.utils.crypt.GATKKey; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.text.ListFileUtils; - -import java.security.PublicKey; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; - -/** - * @author aaron - */ -public abstract class CommandLineExecutable extends CommandLineProgram { - /** - * The actual engine which performs the analysis. - */ - protected GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - - // get the analysis name - public abstract String getAnalysisName(); - - /** - * Gets the GATK argument bundle. - * @return A structure consisting of whatever arguments should be used to initialize the GATK engine. - */ - protected abstract GATKArgumentCollection getArgumentCollection(); - - /** - * A list of all the arguments initially used as sources. - */ - private final Collection argumentSources = new ArrayList(); - - protected static Logger logger = Logger.getLogger(CommandLineExecutable.class); - - /** - * this is the function that the inheriting class can expect to have called - * when the command line system has initialized. - * - * @return the return code to exit the program with - */ - protected int execute() throws Exception { - engine.setParser(parser); - argumentSources.add(this); - - Walker walker = engine.getWalkerByName(getAnalysisName()); - - try { - // Make sure a valid GATK user key is present, if required. - authorizeGATKRun(); - - engine.setArguments(getArgumentCollection()); - - // File lists can require a bit of additional expansion. Set these explicitly by the engine. - final Collection bamFileList=ListFileUtils.unpackBAMFileList(getArgumentCollection().samFiles,parser); - engine.setSAMFileIDs(bamFileList); - if(getArgumentCollection().showFullBamList){ - logger.info(String.format("Adding the following input SAM Files: %s",bamFileList.toString())); - } - - engine.setWalker(walker); - walker.setToolkit(engine); - - Collection filters = engine.createFilters(); - engine.setFilters(filters); - - // load the arguments into the walker / filters. - // TODO: The fact that this extra load call exists here when all the parsing happens at the engine - // TODO: level indicates that we're doing something wrong. Turn this around so that the GATK can drive - // TODO: argument processing. - loadArgumentsIntoObject(walker); - argumentSources.add(walker); - - Collection rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser); - engine.setReferenceMetaDataFiles(rodBindings); - - for (ReadFilter filter: filters) { - loadArgumentsIntoObject(filter); - argumentSources.add(filter); - } - - engine.execute(); - generateGATKRunReport(walker); - } catch ( Exception e ) { - generateGATKRunReport(walker, e); - throw e; - } - - // always return 0 - return 0; - } - - /** - * Authorizes this run of the GATK by checking for a valid GATK user key, if required. - * Currently, a key is required only if running with the -et NO_ET or -et STDOUT options. - */ - private void authorizeGATKRun() { - if ( getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.NO_ET || - getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.STDOUT ) { - if ( getArgumentCollection().gatkKeyFile == null ) { - throw new UserException("Running with the -et NO_ET or -et STDOUT option requires a GATK Key file. " + - "Please see " + UserException.PHONE_HOME_DOCS_URL + - " for more information and instructions on how to obtain a key."); - } - else { - PublicKey gatkPublicKey = CryptUtils.loadGATKDistributedPublicKey(); - GATKKey gatkUserKey = new GATKKey(gatkPublicKey, getArgumentCollection().gatkKeyFile); - - if ( ! gatkUserKey.isValid() ) { - throw new UserException.KeySignatureVerificationException(getArgumentCollection().gatkKeyFile); - } - } - } - } - - /** - * Generate the GATK run report for this walker using the current GATKEngine, if -et is enabled. - * This report will be written to either STDOUT or to the run repository, depending on the options - * for -et. - * - * @param e the exception, can be null if no exception occurred - */ - private void generateGATKRunReport(Walker walker, Exception e) { - if ( getArgumentCollection().phoneHomeType != GATKRunReport.PhoneHomeOption.NO_ET ) { - GATKRunReport report = new GATKRunReport(walker, e, engine, getArgumentCollection().phoneHomeType ); - report.postReport(getArgumentCollection().phoneHomeType); - } - } - - /** - * Convenience method for fully parameterized generateGATKRunReport when an exception has - * not occurred - * - * @param walker - */ - private void generateGATKRunReport(Walker walker) { - generateGATKRunReport(walker, null); - } - - /** - * Subclasses of CommandLinePrograms can provide their own types of command-line arguments. - * @return A collection of type descriptors generating implementation-dependent placeholders. - */ - protected Collection getArgumentTypeDescriptors() { - return Arrays.asList( new VCFWriterArgumentTypeDescriptor(engine,System.out,argumentSources), - new SAMFileWriterArgumentTypeDescriptor(engine,System.out), - new OutputStreamArgumentTypeDescriptor(engine,System.out) ); - } - - /** - * GATK can add arguments dynamically based on analysis type. - * - * @return true - */ - @Override - protected boolean canAddArgumentsDynamically() { - return true; - } - - /** - * GATK provides the walker as an argument source. - * @return List of walkers to load dynamically. - */ - @Override - protected Class[] getArgumentSources() { - // No walker info? No plugins. - if (getAnalysisName() == null) return new Class[] {}; - - Collection argumentSources = new ArrayList(); - - Walker walker = engine.getWalkerByName(getAnalysisName()); - engine.setArguments(getArgumentCollection()); - engine.setWalker(walker); - walker.setToolkit(engine); - argumentSources.add(walker.getClass()); - - Collection filters = engine.createFilters(); - for(ReadFilter filter: filters) - argumentSources.add(filter.getClass()); - - Class[] argumentSourcesAsArray = new Class[argumentSources.size()]; - return argumentSources.toArray(argumentSourcesAsArray); - } - - @Override - protected String getArgumentSourceName( Class argumentSource ) { - return engine.getWalkerName((Class)argumentSource); - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java deleted file mode 100644 index f88c413bb..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java +++ /dev/null @@ -1,385 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine; - -import picard.PicardException; -import htsjdk.samtools.SAMException; -import htsjdk.tribble.TribbleException; -import org.broadinstitute.gatk.utils.commandline.Argument; -import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; -import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; -import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; -import org.broadinstitute.gatk.engine.refdata.tracks.FeatureManager; -import org.broadinstitute.gatk.engine.walkers.Attribution; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.help.*; -import org.broadinstitute.gatk.utils.text.TextFormattingUtils; - -import java.util.*; - -/** - * All command line parameters accepted by all tools in the GATK. - * - *

    Info for general users

    - * - *

    This is a list of options and parameters that are generally available to all tools in the GATK.

    - * - *

    There may be a few restrictions, which are indicated in individual argument descriptions. For example the -BQSR - * argument is only meant to be used with a subset of tools, and the -pedigree argument will only be effectively used - * by a subset of tools as well. Some arguments conflict with others, and some conversely are dependent on others. This - * is all indicated in the detailed argument descriptions, so be sure to read those in their entirety rather than just - * skimming the one-line summaey in the table.

    - * - *

    Info for developers

    - * - *

    This class is the GATK engine itself, which manages map/reduce data access and runs walkers.

    - * - *

    We run command line GATK programs using this class. It gets the command line args, parses them, and hands the - * gatk all the parsed out information. Pretty much anything dealing with the underlying system should go here; - * the GATK engine should deal with any data related information.

    - */ -@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_ENGINE) -public class CommandLineGATK extends CommandLineExecutable { - /** - * A complete list of tools (sometimes also called walkers because they "walk" through the data to perform analyses) - * is available in the online documentation. - */ - @Argument(fullName = "analysis_type", shortName = "T", doc = "Name of the tool to run") - private String analysisName = null; - - // our argument collection, the collection of command line args we accept - @ArgumentCollection - private GATKArgumentCollection argCollection = new GATKArgumentCollection(); - - /** - * Get pleasing info about the GATK. - * - * @return A list of Strings that contain pleasant info about the GATK. - */ - @Override - protected ApplicationDetails getApplicationDetails() { - return new ApplicationDetails(createApplicationHeader(), - getAttribution(), - ApplicationDetails.createDefaultRunningInstructions(getClass()), - getAdditionalHelp()); - } - - @Override - public String getAnalysisName() { - return analysisName; - } - - @Override - protected GATKArgumentCollection getArgumentCollection() { - return argCollection; - } - - /** - * Required main method implementation. - */ - public static void main(String[] argv) { - try { - CommandLineGATK instance = new CommandLineGATK(); - start(instance, argv); - System.exit(CommandLineProgram.result); // todo -- this is a painful hack - } catch (UserException e) { - exitSystemWithUserError(e); - } catch (TribbleException e) { - // We can generate Tribble Exceptions in weird places when e.g. VCF genotype fields are - // lazy loaded, so they aren't caught elsewhere and made into User Exceptions - exitSystemWithUserError(e); - } catch(PicardException e) { - // TODO: Should Picard exceptions be, in general, UserExceptions or ReviewedGATKExceptions? - exitSystemWithError(e); - } catch (SAMException e) { - checkForMaskedUserErrors(e); - exitSystemWithSamError(e); - } catch (OutOfMemoryError e) { - exitSystemWithUserError(new UserException.NotEnoughMemory()); - } catch (Throwable t) { - checkForMaskedUserErrors(t); - exitSystemWithError(t); - } - } - - public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file"; - public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files"; - public static final String NO_SPACE_LEFT_ON_DEVICE_ERROR = "No space left on device"; - public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded"; - - private static void checkForMaskedUserErrors(final Throwable t) { - // masked out of memory error - if ( t instanceof OutOfMemoryError ) - exitSystemWithUserError(new UserException.NotEnoughMemory()); - // masked user error - if ( t instanceof UserException || t instanceof TribbleException ) - exitSystemWithUserError(new UserException(t.getMessage())); - - // no message means no masked error - final String message = t.getMessage(); - if ( message == null ) - return; - - // too many open files error - if ( message.contains("Too many open files") ) - exitSystemWithUserError(new UserException.TooManyOpenFiles()); - - // malformed BAM looks like a SAM file - if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) - exitSystemWithSamError(t); - - // can't close tribble index when writing - if ( message.contains("Unable to close index for") ) - exitSystemWithUserError(new UserException(t.getCause() == null ? message : t.getCause().getMessage())); - - // disk is full - if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) ) - exitSystemWithUserError(new UserException.NoSpaceOnDevice()); - - // masked error wrapped in another one - if ( t.getCause() != null ) - checkForMaskedUserErrors(t.getCause()); - } - - /** - * Creates the a short blurb about the GATK, copyright info, and where to get documentation. - * - * @return The application header. - */ - public static List createApplicationHeader() { - List header = new ArrayList(); - header.add(String.format("The Genome Analysis Toolkit (GATK) v%s, Compiled %s",getVersionNumber(), getBuildTime())); - header.add("Copyright (c) 2010 The Broad Institute"); - header.add("For support and documentation go to " + HelpConstants.BASE_GATK_URL); - return header; - } - - public static String getVersionNumber() { - ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("GATKText"); - return headerInfo.containsKey("org.broadinstitute.gatk.tools.version") ? headerInfo.getString("org.broadinstitute.gatk.tools.version") : ""; - } - - public static String getBuildTime() { - ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("GATKText"); - return headerInfo.containsKey("build.timestamp") ? headerInfo.getString("build.timestamp") : ""; - } - - /** - * If the user supplied any additional attribution, return it here. - * @return Additional attribution if supplied by the user. Empty (non-null) list otherwise. - */ - private List getAttribution() { - List attributionLines = new ArrayList(); - - // If no analysis name is present, fill in extra help on the walkers. - WalkerManager walkerManager = engine.getWalkerManager(); - String analysisName = getAnalysisName(); - if(analysisName != null && walkerManager.exists(analysisName)) { - Class walkerType = walkerManager.getWalkerClassByName(analysisName); - if(walkerType.isAnnotationPresent(Attribution.class)) - attributionLines.addAll(Arrays.asList(walkerType.getAnnotation(Attribution.class).value())); - } - return attributionLines; - } - - /** - * Retrieves additional information about GATK walkers. - * the code in HelpFormatter and supply it as a helper to this method. - * - * @return A string summarizing the walkers available in this distribution. - */ - private String getAdditionalHelp() { - String additionalHelp; - - // If no analysis name is present, fill in extra help on the walkers. - WalkerManager walkerManager = engine.getWalkerManager(); - String analysisName = getAnalysisName(); - if(analysisName != null && walkerManager.exists(getAnalysisName())) - additionalHelp = getWalkerHelp(walkerManager.getWalkerClassByName(getAnalysisName())); - else - additionalHelp = getAllWalkerHelp(); - - return additionalHelp; - } - - private static final int PACKAGE_INDENT = 1; - private static final int WALKER_INDENT = 3; - private static final String FIELD_SEPARATOR = " "; - - private String getWalkerHelp(Class walkerType) { - // Construct a help string to output details on this walker. - StringBuilder additionalHelp = new StringBuilder(); - Formatter formatter = new Formatter(additionalHelp); - - formatter.format("Available Reference Ordered Data types:%n"); - formatter.format(new FeatureManager().userFriendlyListOfAvailableFeatures()); - formatter.format("%n"); - - formatter.format("For a full description of this walker, see its GATKdocs at:%n"); - formatter.format("%s%n", GATKDocUtils.helpLinksToGATKDocs(walkerType)); - - return additionalHelp.toString(); - } - - /** - * Load in additional help information about all available walkers. - * @return A string representation of the additional help. - */ - private String getAllWalkerHelp() { - // Construct a help string to output available walkers. - StringBuilder additionalHelp = new StringBuilder(); - Formatter formatter = new Formatter(additionalHelp); - - // Get the list of walker names from the walker manager. - WalkerManager walkerManager = engine.getWalkerManager(); - - // Build a list sorted by walker display name. As this information is collected, keep track of the longest - // package / walker name for later formatting. - SortedSet helpText = new TreeSet(new HelpEntryComparator()); - - int longestPackageName = 0; - int longestWalkerName = 0; - for(Map.Entry>> walkersByPackage: walkerManager.getWalkerNamesByPackage(true).entrySet()) { - // Get the display name. - String packageName = walkersByPackage.getKey(); - String packageDisplayName = walkerManager.getPackageDisplayName(walkersByPackage.getKey()); - String packageHelpText = walkerManager.getPackageSummaryText(packageName); - - // Compute statistics about which names is longest. - longestPackageName = Math.max(longestPackageName,packageDisplayName.length()); - - SortedSet walkersInPackage = new TreeSet(new HelpEntryComparator()); - for(Class walkerType: walkersByPackage.getValue()) { - String walkerName = walkerType.getName(); - String walkerDisplayName = walkerManager.getName(walkerType); - String walkerHelpText = walkerManager.getWalkerSummaryText(walkerType); - - longestWalkerName = Math.max(longestWalkerName,walkerManager.getName(walkerType).length()); - - walkersInPackage.add(new HelpEntry(walkerName,walkerDisplayName,walkerHelpText)); - } - - // Dump the walkers into the sorted set. - helpText.add(new HelpEntry(packageName,packageDisplayName,packageHelpText,Collections.unmodifiableSortedSet(walkersInPackage))); - } - - final int headerWidth = Math.max(longestPackageName+PACKAGE_INDENT,longestWalkerName+WALKER_INDENT); - - - for(HelpEntry packageHelp: helpText) { - printDescriptorLine(formatter,PACKAGE_INDENT,packageHelp.displayName,headerWidth,FIELD_SEPARATOR,packageHelp.summary,TextFormattingUtils.DEFAULT_LINE_WIDTH); - - for(HelpEntry walkerHelp: packageHelp.children) - printDescriptorLine(formatter,WALKER_INDENT,walkerHelp.displayName,headerWidth,FIELD_SEPARATOR,walkerHelp.summary,TextFormattingUtils.DEFAULT_LINE_WIDTH); - - // Print a blank line between sets of walkers. - printDescriptorLine(formatter,0,"",headerWidth,FIELD_SEPARATOR,"", TextFormattingUtils.DEFAULT_LINE_WIDTH); - } - - return additionalHelp.toString(); - } - - private void printDescriptorLine(Formatter formatter, - int headerIndentWidth, - String header, - int headerWidth, - String fieldSeparator, - String description, - int lineWidth) { - final int headerPaddingWidth = headerWidth - header.length() - headerIndentWidth; - final int descriptionWidth = lineWidth - fieldSeparator.length() - headerWidth; - List wordWrappedText = TextFormattingUtils.wordWrap(description,descriptionWidth); - - String headerIndentFormatString = headerIndentWidth > 0 ? "%" + headerIndentWidth + "s" : "%s"; - String headerPaddingFormatString = headerPaddingWidth > 0 ? "%" + headerPaddingWidth + "s" : "%s"; - String headerWidthFormatString = headerWidth > 0 ? "%" + headerWidth + "s" : "%s"; - - // Output description line. - formatter.format(headerIndentFormatString + "%s" + headerPaddingFormatString + "%s%s%n", - "", header, "", fieldSeparator, wordWrappedText.size()>0?wordWrappedText.get(0):""); - for(int i = 1; i < wordWrappedText.size(); i++) - formatter.format(headerWidthFormatString + "%s%s%n", "", fieldSeparator, wordWrappedText.get(i)); - } - -} - -/** - * Represents a given help entry; contains a display name, a summary and optionally some children. - */ -class HelpEntry { - public final String uid; - public final String displayName; - public final String summary; - public final SortedSet children; - - /** - * Create a new help entry with the given display name, summary and children. - * @param uid a unique identifier. Usually, the java package. - * @param displayName display name for this help entry. - * @param summary summary for this help entry. - * @param children children for this help entry. - */ - public HelpEntry(String uid, String displayName, String summary, SortedSet children) { - this.uid = uid; - this.displayName = displayName; - this.summary = summary; - this.children = children; - } - - /** - * Create a new help entry with the given display name, summary and children. - * @param uid a unique identifier. Usually, the java package. - * @param displayName display name for this help entry. - * @param summary summary for this help entry. - */ - public HelpEntry(String uid, String displayName, String summary) { - this(uid,displayName,summary,null); - } - -} - -/** - * Compare two help entries by display name. - */ -class HelpEntryComparator implements Comparator { - private static TextFormattingUtils.CaseInsensitiveComparator textComparator = new TextFormattingUtils.CaseInsensitiveComparator(); - - /** - * Compares the order of lhs to rhs, not taking case into account. - * @param lhs First object to compare. - * @param rhs Second object to compare. - * @return 0 if objects are identical; -1 if lhs is before rhs, 1 if rhs is before lhs. Nulls are treated as after everything else. - */ - public int compare(HelpEntry lhs, HelpEntry rhs) { - if(lhs == null && rhs == null) return 0; - if(lhs == null || lhs.displayName.equals("")) return 1; - if(rhs == null || rhs.displayName.equals("")) return -1; - return lhs.displayName.equals(rhs.displayName) ? textComparator.compare(lhs.uid,rhs.uid) : textComparator.compare(lhs.displayName,rhs.displayName); - } - - -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java deleted file mode 100644 index abb699301..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java +++ /dev/null @@ -1,1280 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine; - -import com.google.java.contract.Ensures; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.samtools.reference.ReferenceSequenceFile; -import htsjdk.variant.vcf.VCFConstants; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.engine.datasources.reads.*; -import org.broadinstitute.gatk.engine.datasources.reference.ReferenceDataSource; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.downsampling.DownsamplingMethod; -import org.broadinstitute.gatk.engine.executive.MicroScheduler; -import org.broadinstitute.gatk.engine.filters.FilterManager; -import org.broadinstitute.gatk.engine.filters.ReadFilter; -import org.broadinstitute.gatk.engine.filters.ReadGroupBlackListFilter; -import org.broadinstitute.gatk.engine.io.OutputTracker; -import org.broadinstitute.gatk.engine.io.stubs.SAMFileWriterStub; -import org.broadinstitute.gatk.engine.io.stubs.Stub; -import org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.iterators.ReadTransformersMode; -import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; -import org.broadinstitute.gatk.engine.refdata.tracks.IndexDictionaryUtils; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.engine.samples.SampleDB; -import org.broadinstitute.gatk.engine.samples.SampleDBBuilder; -import org.broadinstitute.gatk.engine.walkers.*; -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; -import org.broadinstitute.gatk.utils.*; -import org.broadinstitute.gatk.utils.classloader.PluginManager; -import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.interval.IntervalUtils; -import org.broadinstitute.gatk.utils.progressmeter.ProgressMeter; -import org.broadinstitute.gatk.utils.recalibration.BQSRArgumentSet; -import org.broadinstitute.gatk.utils.sam.ReadUtils; -import org.broadinstitute.gatk.utils.text.XReadLines; -import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; -import java.util.concurrent.TimeUnit; - -import static org.broadinstitute.gatk.utils.DeprecatedToolChecks.getWalkerDeprecationInfo; -import static org.broadinstitute.gatk.utils.DeprecatedToolChecks.isDeprecatedWalker; - -/** - * A GenomeAnalysisEngine that runs a specified walker. - */ -public class GenomeAnalysisEngine { - /** - * our log, which we want to capture anything from this class - */ - private static Logger logger = Logger.getLogger(GenomeAnalysisEngine.class); - public static final long NO_RUNTIME_LIMIT = -1; - - /** - * The GATK command-line argument parsing code. - */ - private ParsingEngine parsingEngine; - - /** - * The genomeLocParser can create and parse GenomeLocs. - */ - private GenomeLocParser genomeLocParser; - - /** - * Accessor for sharded read data. - */ - private SAMDataSource readsDataSource = null; - - /** - * Accessor for sharded reference data. - */ - private ReferenceDataSource referenceDataSource = null; - - /** - * Accessor for sample metadata - */ - private SampleDB sampleDB = new SampleDB(); - - /** - * Accessor for sharded reference-ordered data. - */ - private List rodDataSources; - - // our argument collection - private GATKArgumentCollection argCollection; - - /** - * Collection of intervals used by the engine. - */ - private GenomeLocSortedSet intervals = null; - - /** - * Explicitly assign the interval set to use for this traversal (for unit testing purposes) - * @param intervals set of intervals to use for this traversal - */ - public void setIntervals( GenomeLocSortedSet intervals ) { - this.intervals = intervals; - } - - /** - * Collection of inputs used by the engine. - */ - private Map inputs = new HashMap(); - - /** - * Collection of outputs used by the engine. - */ - private Collection> outputs = new ArrayList>(); - - /** - * Collection of the filters applied to the input data. - */ - private Collection filters; - - /** - * Collection of the read transformers applied to the reads - */ - private List readTransformers; - - /** - * Controls the allocation of threads between CPU vs IO. - */ - private ThreadAllocation threadAllocation; - - private ReadMetrics cumulativeMetrics = null; - - /** - * A currently hacky unique name for this GATK instance - */ - private String myName = "GATK_" + Math.abs(getRandomGenerator().nextInt()); - - /** - * our walker manager - */ - private final WalkerManager walkerManager = new WalkerManager(); - - private Walker walker; - - public void setWalker(Walker walker) { - this.walker = walker; - } - - /** - * The short name of the current GATK walker as a string - * @return a non-null String - */ - public String getWalkerName() { - return getWalkerName(walker.getClass()); - } - - /** - * A processed collection of SAM reader identifiers. - */ - private Collection samReaderIDs = Collections.emptyList(); - - /** - * Set the SAM/BAM files over which to traverse. - * @param samReaderIDs Collection of ids to use during this traversal. - */ - public void setSAMFileIDs(Collection samReaderIDs) { - this.samReaderIDs = samReaderIDs; - } - - /** - * Collection of reference metadata files over which to traverse. - */ - private Collection referenceMetaDataFiles; - - /** - * The threading efficiency monitor we use in the GATK to monitor our efficiency. - * - * May be null if one isn't active, or hasn't be initialized yet - */ - private ThreadEfficiencyMonitor threadEfficiencyMonitor = null; - - /** - * The global progress meter we are using to track our progress through the genome - */ - private ProgressMeter progressMeter = null; - - /** - * Set the reference metadata files to use for this traversal. - * @param referenceMetaDataFiles Collection of files and descriptors over which to traverse. - */ - public void setReferenceMetaDataFiles(Collection referenceMetaDataFiles) { - this.referenceMetaDataFiles = referenceMetaDataFiles; - } - - /** - * The maximum runtime of this engine, in nanoseconds, set during engine initialization - * from the GATKArgumentCollection command line value - */ - private long runtimeLimitInNanoseconds = -1; - - /** - * Static random number generator and seed. - */ - private static final long GATK_RANDOM_SEED = 47382911L; - private static Random randomGenerator = new Random(GATK_RANDOM_SEED); - public static Random getRandomGenerator() { return randomGenerator; } - public static void resetRandomGenerator() { randomGenerator.setSeed(GATK_RANDOM_SEED); } - public static void resetRandomGenerator(long seed) { randomGenerator.setSeed(seed); } - - /** - * Base Quality Score Recalibration helper object - */ - private BQSRArgumentSet bqsrArgumentSet = null; - public BQSRArgumentSet getBQSRArgumentSet() { return bqsrArgumentSet; } - public boolean hasBQSRArgumentSet() { return bqsrArgumentSet != null; } - public void setBaseRecalibration(final GATKArgumentCollection args) { - bqsrArgumentSet = new BQSRArgumentSet(args); - } - - /** - * Actually run the GATK with the specified walker. - * - * @return the value of this traversal. - */ - public Object execute() { - // first thing is to make sure the AWS keys can be decrypted - GATKRunReport.checkAWSAreValid(); - - //HeapSizeMonitor monitor = new HeapSizeMonitor(); - //monitor.start(); - setStartTime(new java.util.Date()); - - final GATKArgumentCollection args = this.getArguments(); - - // validate our parameters - if (args == null) { - throw new ReviewedGATKException("The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null."); - } - - // validate our parameters - if (this.walker == null) - throw new ReviewedGATKException("The walker passed to GenomeAnalysisEngine can not be null."); - - if (args.nonDeterministicRandomSeed) - resetRandomGenerator(System.currentTimeMillis()); - - // if the use specified an input BQSR recalibration table then enable on the fly recalibration - if (args.BQSR_RECAL_FILE != null) - setBaseRecalibration(args); - - // setup the runtime limits - setupRuntimeLimits(args); - - // Determine how the threads should be divided between CPU vs. IO. - determineThreadAllocation(); - - // Prepare the data for traversal. - initializeDataSources(); - - // initialize and validate the interval list - initializeIntervals(); - validateSuppliedIntervals(); - - // check to make sure that all sequence dictionaries are compatible with the reference's sequence dictionary - validateDataSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), rodDataSources); - - // initialize sampleDB - initializeSampleDB(); - - // our microscheduler, which is in charge of running everything - MicroScheduler microScheduler = createMicroscheduler(); - threadEfficiencyMonitor = microScheduler.getThreadEfficiencyMonitor(); - - // create temp directories as necessary - initializeTempDirectory(); - - // create the output streams - initializeOutputStreams(microScheduler.getOutputTracker()); - - // Initializing the shard iterator / BAM schedule might take some time, so let the user know vaguely what's going on - logger.info("Preparing for traversal" + - (readsDataSource.getReaderIDs().size() > 0 ? String.format(" over %d BAM files", readsDataSource.getReaderIDs().size()) : "")); - Iterable shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); - logger.info("Done preparing for traversal"); - - // execute the microscheduler, storing the results - return microScheduler.execute(this.walker, shardStrategy); - - //monitor.stop(); - //logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed())); - - //return result; - } - - /** - * Retrieves an instance of the walker based on the walker name. - * - * @param walkerName Name of the walker. Must not be null. If the walker cannot be instantiated, an exception will be thrown. - * @return An instance of the walker. - */ - public Walker getWalkerByName(String walkerName) { - try { - return walkerManager.createByName(walkerName); - } catch ( UserException e ) { - if ( isDeprecatedWalker(walkerName) ) { - e = new UserException.DeprecatedWalker(walkerName, getWalkerDeprecationInfo(walkerName)); - } - throw e; - } - } - - /** - * Gets the name of a given walker type. - * @param walkerType Type of walker. - * @return Name of the walker. - */ - public String getWalkerName(Class walkerType) { - return walkerManager.getName(walkerType); - } - - public String getName() { - return myName; - } - - /** - * Gets a list of the filters to associate with the given walker. Will NOT initialize the engine with this filters; - * the caller must handle that directly. - * @return A collection of available filters. - */ - public Collection createFilters() { - final List filters = new LinkedList<>(); - - // First add the user requested filters - if (this.getArguments().readGroupBlackList != null && this.getArguments().readGroupBlackList.size() > 0) - filters.add(new ReadGroupBlackListFilter(this.getArguments().readGroupBlackList)); - for(final String filterName: this.getArguments().readFilters) - filters.add(this.getFilterManager().createByName(filterName)); - - // now add the walker default filters. This ordering is critical important if - // users need to apply filters that fix up reads that would be removed by default walker filters - filters.addAll(WalkerManager.getReadFilters(walker,this.getFilterManager())); - - return Collections.unmodifiableList(filters); - } - - /** - * Returns a list of active, initialized read transformers - * - * @param walker the walker we need to apply read transformers too - */ - public void initializeReadTransformers(final Walker walker) { - // keep a list of the active read transformers sorted based on priority ordering - List activeTransformers = new ArrayList(); - - final ReadTransformersMode overrideMode = WalkerManager.getWalkerAnnotation(walker, ReadTransformersMode.class); - final ReadTransformer.ApplicationTime overrideTime = overrideMode != null ? overrideMode.ApplicationTime() : null; - - final PluginManager pluginManager = new PluginManager(ReadTransformer.class); - - for ( final ReadTransformer transformer : pluginManager.createAllTypes() ) { - transformer.initialize(overrideTime, this, walker); - if ( transformer.enabled() ) - activeTransformers.add(transformer); - } - - setReadTransformers(activeTransformers); - } - - public List getReadTransformers() { - return readTransformers; - } - - /* - * Sanity checks that incompatible read transformers are not active together (and throws an exception if they are). - * - * @param readTransformers the active read transformers - */ - protected void checkActiveReadTransformers(final List readTransformers) { - if ( readTransformers == null ) - throw new IllegalArgumentException("read transformers cannot be null"); - - ReadTransformer sawMustBeFirst = null; - ReadTransformer sawMustBeLast = null; - - for ( final ReadTransformer r : readTransformers ) { - if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_FIRST ) { - if ( sawMustBeFirst != null ) - throw new UserException.IncompatibleReadFiltersException(sawMustBeFirst.toString(), r.toString()); - sawMustBeFirst = r; - } else if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_LAST ) { - if ( sawMustBeLast != null ) - throw new UserException.IncompatibleReadFiltersException(sawMustBeLast.toString(), r.toString()); - sawMustBeLast = r; - } - } - } - - protected void setReadTransformers(final List readTransformers) { - if ( readTransformers == null ) - throw new ReviewedGATKException("read transformers cannot be null"); - - // sort them in priority order - Collections.sort(readTransformers, new ReadTransformer.ReadTransformerComparator()); - - // make sure we don't have an invalid set of active read transformers - checkActiveReadTransformers(readTransformers); - - this.readTransformers = readTransformers; - } - - /** - * Parse out the thread allocation from the given command-line argument. - */ - private void determineThreadAllocation() { - if ( argCollection.numberOfDataThreads < 1 ) throw new UserException.BadArgumentValue("num_threads", "cannot be less than 1, but saw " + argCollection.numberOfDataThreads); - if ( argCollection.numberOfCPUThreadsPerDataThread < 1 ) throw new UserException.BadArgumentValue("num_cpu_threads", "cannot be less than 1, but saw " + argCollection.numberOfCPUThreadsPerDataThread); - if ( argCollection.numberOfIOThreads < 0 ) throw new UserException.BadArgumentValue("num_io_threads", "cannot be less than 0, but saw " + argCollection.numberOfIOThreads); - - this.threadAllocation = new ThreadAllocation(argCollection.numberOfDataThreads, - argCollection.numberOfCPUThreadsPerDataThread, - argCollection.numberOfIOThreads, - argCollection.monitorThreadEfficiency); - } - - public int getTotalNumberOfThreads() { - return this.threadAllocation == null ? 1 : threadAllocation.getTotalNumThreads(); - } - - - - /** - * Allow subclasses and others within this package direct access to the walker manager. - * @return The walker manager used by this package. - */ - protected WalkerManager getWalkerManager() { - return walkerManager; - } - - /** - * setup a microscheduler - * - * @return a new microscheduler - */ - private MicroScheduler createMicroscheduler() { - // Temporarily require all walkers to have a reference, even if that reference is not conceptually necessary. - if ((walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker) && - this.getArguments().referenceFile == null) { - throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given"); - } - - return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation); - } - - protected DownsamplingMethod getDownsamplingMethod() { - GATKArgumentCollection argCollection = this.getArguments(); - - DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod(); - DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker); - - DownsamplingMethod method = commandLineMethod != null ? commandLineMethod : walkerMethod; - method.checkCompatibilityWithWalker(walker); - return method; - } - - protected void setDownsamplingMethod(DownsamplingMethod method) { - argCollection.setDownsamplingMethod(method); - } - - protected boolean includeReadsWithDeletionAtLoci() { - return walker.includeReadsWithDeletionAtLoci(); - } - - /** - * Verifies that the supplied set of reads files mesh with what the walker says it requires; - * also makes sure that list of SAM files specified on the command line is not empty and contains - * no duplicates. - */ - protected void validateSuppliedReads() { - GATKArgumentCollection arguments = this.getArguments(); - final Boolean samFilesArePresent = (arguments.samFiles != null && !arguments.samFiles.isEmpty()); - - // Check what the walker says is required against what was provided on the command line. - if (WalkerManager.isRequired(walker, DataSource.READS) && !samFilesArePresent) - throw new ArgumentException("Walker requires reads but none were provided."); - - // Check what the walker says is allowed against what was provided on the command line. - if (samFilesArePresent && !WalkerManager.isAllowed(walker, DataSource.READS)) - throw new ArgumentException("Walker does not allow reads but reads were provided."); - - //Make sure SAM list specified by the user (if necessary) is not empty - if(WalkerManager.isRequired(walker, DataSource.READS) && samFilesArePresent && samReaderIDs.isEmpty() ) { - throw new UserException("The list of input files does not contain any BAM files."); - } - - // Make sure no SAM files were specified multiple times by the user. - checkForDuplicateSamFiles(); - } - - /** - * Checks whether there are SAM files that appear multiple times in the fully unpacked list of - * SAM files (samReaderIDs). If there are, throws an ArgumentException listing the files in question. - */ - protected void checkForDuplicateSamFiles() { - Set encounteredSamFiles = new HashSet(); - Set duplicateSamFiles = new LinkedHashSet(); - - for ( SAMReaderID samFile : samReaderIDs ) { - if ( encounteredSamFiles.contains(samFile) ) { - duplicateSamFiles.add(samFile.getSamFilePath()); - } - else { - encounteredSamFiles.add(samFile); - } - } - - if ( duplicateSamFiles.size() > 0 ) { - throw new UserException("The following BAM files appear multiple times in the list of input files: " + - duplicateSamFiles + " BAM files may be specified at most once."); - } - - } - - /** - * Verifies that the supplied reference file mesh with what the walker says it requires. - */ - protected void validateSuppliedReference() { - GATKArgumentCollection arguments = this.getArguments(); - // Check what the walker says is required against what was provided on the command line. - // TODO: Temporarily disabling WalkerManager.isRequired check on the reference because the reference is always required. - if (/*WalkerManager.isRequired(walker, DataSource.REFERENCE) &&*/ arguments.referenceFile == null) - throw new ArgumentException("Walker requires a reference but none was provided."); - - // Check what the walker says is allowed against what was provided on the command line. - if (arguments.referenceFile != null && !WalkerManager.isAllowed(walker, DataSource.REFERENCE)) - throw new ArgumentException("Walker does not allow a reference but one was provided."); - } - - protected void validateSuppliedIntervals() { - // Only read walkers support '-L unmapped' intervals. Trap and validate any other instances of -L unmapped. - if(!(walker instanceof ReadWalker)) { - GenomeLocSortedSet intervals = getIntervals(); - if(intervals != null && getIntervals().contains(GenomeLoc.UNMAPPED)) - throw new ArgumentException("Interval list specifies unmapped region. Only read walkers may include the unmapped region."); - } - - // If intervals is non-null and empty at this point, it means that the list of intervals to process - // was filtered down to an empty set (eg., the user specified something like -L chr1 -XL chr1). Since - // this was very likely unintentional, the user should be informed of this. Note that this is different - // from the case where intervals == null, which indicates that there were no interval arguments. - if ( intervals != null && intervals.isEmpty() ) { - logger.warn("The given combination of -L and -XL options results in an empty set. No intervals to process."); - } - - // TODO: add a check for ActiveRegion walkers to prevent users from passing an entire contig/chromosome - } - - /** - * Get the sharding strategy given a driving data source. - * - * @param readsDataSource readsDataSource - * @param drivingDataSource Data on which to shard. - * @param intervals intervals - * @return the sharding strategy - */ - protected Iterable getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { - ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); - DownsamplingMethod downsamplingMethod = readsDataSource != null ? readsDataSource.getReadsInfo().getDownsamplingMethod() : null; - ReferenceDataSource referenceDataSource = this.getReferenceDataSource(); - - // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition. - if(!readsDataSource.isEmpty()) { - if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM)) - throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported."); - if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) - throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available."); - - if(walker instanceof LocusWalker) { - if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer()); - else - return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); - } - else if(walker instanceof ActiveRegionWalker) { - if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(new ActiveRegionShardBalancer()); - else - return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new ActiveRegionShardBalancer()); - } - else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { - // Apply special validation to read pair walkers. - if(walker instanceof ReadPairWalker) { - if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker."); - if(intervals != null && !intervals.isEmpty()) - throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); - } - - if(intervals == null) - return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); - else - return readsDataSource.createShardIteratorOverIntervals(intervals, new ReadShardBalancer()); - } - else - throw new ReviewedGATKException("Unable to determine walker type for walker " + walker.getClass().getName()); - } - else { - // TODO -- Determine what the ideal shard size should be here. Matt suggested that a multiple of 16K might work well - // TODO -- (because of how VCF indexes work), but my empirical experience has been simply that the larger the shard - // TODO -- size the more efficient the traversal (at least for RODWalkers). Keeping the previous values for now. [EB] - final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000; - if(intervals == null) - return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE); - else - return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE); - } - } - - protected boolean flashbackData() { - return walker instanceof ReadWalker; - } - - /** - * Create the temp directory if it doesn't exist. - */ - private void initializeTempDirectory() { - File tempDir = new File(System.getProperty("java.io.tmpdir")); - if (!tempDir.exists() && !tempDir.mkdirs()) - throw new UserException.BadTmpDir("Unable to create directory"); - } - - /** - * Initialize the output streams as specified by the user. - * - * @param outputTracker the tracker supplying the initialization data. - */ - private void initializeOutputStreams(final OutputTracker outputTracker) { - for (final Map.Entry input : getInputs().entrySet()) - outputTracker.addInput(input.getKey(), input.getValue()); - for (final Stub stub : getOutputs()) { - stub.processArguments(argCollection); - outputTracker.addOutput(stub); - } - - outputTracker.prepareWalker(walker, getArguments().strictnessLevel); - } - - public ReferenceDataSource getReferenceDataSource() { - return referenceDataSource; - } - - public GenomeLocParser getGenomeLocParser() { - return genomeLocParser; - } - - /** - * Manage lists of filters. - */ - private final FilterManager filterManager = new FilterManager(); - - private Date startTime = null; // the start time for execution - - public void setParser(ParsingEngine parsingEngine) { - this.parsingEngine = parsingEngine; - } - - /** - * Explicitly set the GenomeLocParser, for unit testing. - * @param genomeLocParser GenomeLocParser to use. - */ - public void setGenomeLocParser(GenomeLocParser genomeLocParser) { - this.genomeLocParser = genomeLocParser; - } - - /** - * Sets the start time when the execute() function was last called - * @param startTime the start time when the execute() function was last called - */ - protected void setStartTime(Date startTime) { - this.startTime = startTime; - } - - /** - * @return the start time when the execute() function was last called - */ - public Date getStartTime() { - return startTime; - } - - /** - * Setup the intervals to be processed - */ - protected void initializeIntervals() { - intervals = IntervalUtils.parseIntervalArguments(this.referenceDataSource, argCollection.intervalArguments); - } - - /** - * Add additional, externally managed IO streams for inputs. - * - * @param argumentSource Field into which to inject the value. - * @param value Instance to inject. - */ - public void addInput(ArgumentSource argumentSource, Object value) { - inputs.put(argumentSource, value); - } - - /** - * Add additional, externally managed IO streams for output. - * - * @param stub Instance to inject. - */ - public void addOutput(Stub stub) { - outputs.add(stub); - } - - /** - * Returns the tag associated with a given command-line argument. - * @param key Object for which to inspect the tag. - * @return Tags object associated with the given key, or an empty Tag structure if none are present. - */ - public Tags getTags(Object key) { - return parsingEngine.getTags(key); - } - - protected void initializeDataSources() { - logger.info("Strictness is " + argCollection.strictnessLevel); - - validateSuppliedReference(); - setReferenceDataSource(argCollection.referenceFile); - - validateSuppliedReads(); - initializeReadTransformers(walker); - - final Map sampleRenameMap = argCollection.sampleRenameMappingFile != null ? - loadSampleRenameMap(argCollection.sampleRenameMappingFile) : - null; - - readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference(), sampleRenameMap); - - for (ReadFilter filter : filters) - filter.initialize(this); - - // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference - rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles,referenceDataSource.getReference().getSequenceDictionary(), - genomeLocParser,argCollection.unsafe,sampleRenameMap); - } - - /** - * Purely for testing purposes. Do not use unless you absolutely positively know what you are doing (or - * need to absolutely positively kill everyone in the room) - * @param dataSource - */ - public void setReadsDataSource(final SAMDataSource dataSource) { - this.readsDataSource = dataSource; - } - - /** - * Entry-point function to initialize the samples database from input data and pedigree arguments - */ - private void initializeSampleDB() { - SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType); - sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader()); - sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this)); - sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles); - sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings); - sampleDB = sampleDBBuilder.getFinalSampleDB(); - } - - /** - * Gets a unique identifier for the reader sourcing this read. - * @param read Read to examine. - * @return A unique identifier for the source file of this read. Exception if not found. - */ - public SAMReaderID getReaderIDForRead(final SAMRecord read) { - return getReadsDataSource().getReaderID(read); - } - - /** - * Gets the source file for this read. - * @param id Unique identifier determining which input file to use. - * @return The source filename for this read. - */ - public File getSourceFileForReaderID(final SAMReaderID id) { - return getReadsDataSource().getSAMFile(id); - } - - /** - * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available). - * - * @param reads Reads data source. - * @param reference Reference data source. - * @param rods a collection of the reference ordered data tracks - */ - private void validateDataSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection rods) { - if ((reads.isEmpty() && (rods == null || rods.isEmpty())) || reference == null ) - return; - - // Compile a set of sequence names that exist in the reference file. - SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary(); - - if (!reads.isEmpty()) { - // Compile a set of sequence names that exist in the BAM files. - SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); - - if (readsDictionary.size() == 0) { - logger.info("Reads file is unmapped. Skipping validation against reference."); - return; - } - - // compare the reads to the reference - SequenceDictionaryUtils.validateDictionaries(logger, getArguments().unsafe, "reads", readsDictionary, - "reference", referenceDictionary, true, intervals); - } - - for (ReferenceOrderedDataSource rod : rods) - IndexDictionaryUtils.validateTrackSequenceDictionary(rod.getName(), rod.getSequenceDictionary(), referenceDictionary, getArguments().unsafe); - } - - /** - * Gets a data source for the given set of reads. - * - * @param argCollection arguments - * @param genomeLocParser parser - * @param refReader reader - * @return A data source for the given set of reads. - */ - private SAMDataSource createReadsDataSource(final GATKArgumentCollection argCollection, final GenomeLocParser genomeLocParser, - final IndexedFastaSequenceFile refReader, final Map sampleRenameMap) { - DownsamplingMethod downsamplingMethod = getDownsamplingMethod(); - - // Synchronize the method back into the collection so that it shows up when - // interrogating for the downsampling method during command line recreation. - setDownsamplingMethod(downsamplingMethod); - - logger.info(downsamplingMethod); - - if (argCollection.removeProgramRecords && argCollection.keepProgramRecords) - throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options"); - - boolean removeProgramRecords = argCollection.removeProgramRecords || walker.getClass().isAnnotationPresent(RemoveProgramRecords.class); - - if (argCollection.keepProgramRecords) - removeProgramRecords = false; - - final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker; - - return new SAMDataSource( - samReaderIDs, - threadAllocation, - argCollection.numberOfBAMFileHandles, - genomeLocParser, - argCollection.useOriginalBaseQualities, - argCollection.strictnessLevel, - argCollection.readBufferSize, - downsamplingMethod, - new ValidationExclusion(Arrays.asList(argCollection.unsafe)), - filters, - readTransformers, - includeReadsWithDeletionAtLoci(), - argCollection.defaultBaseQualities, - removeProgramRecords, - keepReadsInLIBS, - sampleRenameMap, - argCollection.intervalArguments.intervalMerging); - } - - /** - * Loads a user-provided sample rename map file for use in on-the-fly sample renaming into an in-memory - * HashMap. This file must consist of lines with two whitespace-separated fields, the second of which - * may contain whitespace: - * - * absolute_path_to_file new_sample_name - * - * The engine will verify that each file contains data from only one sample when the on-the-fly sample - * renaming feature is being used. Note that this feature works only with bam and vcf files. - * - * @param sampleRenameMapFile sample rename map file from which to load data - * @return a HashMap containing the contents of the map file, with the keys being the input file paths and - * the values being the new sample names. - */ - protected Map loadSampleRenameMap( final File sampleRenameMapFile ) { - logger.info("Renaming samples from input files on-the-fly using mapping file " + sampleRenameMapFile.getAbsolutePath()); - - final Map sampleRenameMap = new HashMap<>((int)sampleRenameMapFile.length() / 50); - - try { - for ( final String line : new XReadLines(sampleRenameMapFile) ) { - final String[] tokens = line.split("\\s+", 2); - - if ( tokens.length != 2 ) { - throw new UserException.MalformedFile(sampleRenameMapFile, - String.format("Encountered a line with %s fields instead of the required 2 fields. Line was: %s", - tokens.length, line)); - } - - final File inputFile = new File(tokens[0]); - final String newSampleName = tokens[1].trim(); - - if (newSampleName.contains(VCFConstants.FIELD_SEPARATOR)) { - throw new UserException.MalformedFile(sampleRenameMapFile, String.format( - "Encountered illegal sample name; sample names may not include the VCF field delimiter (%s). Sample name: %s; line: %s", - VCFConstants.FIELD_SEPARATOR, - newSampleName, - line - )); - } - - if ( ! inputFile.isAbsolute() ) { - throw new UserException.MalformedFile(sampleRenameMapFile, "Input file path not absolute at line: " + line); - } - - final String inputFilePath = inputFile.getAbsolutePath(); - - if ( sampleRenameMap.containsKey(inputFilePath) ) { - throw new UserException.MalformedFile(sampleRenameMapFile, - String.format("Input file %s appears more than once", inputFilePath)); - } - - sampleRenameMap.put(inputFilePath, newSampleName); - } - } - catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(sampleRenameMapFile, e); - } - - return sampleRenameMap; - } - - - /** - * Opens a reference sequence file paired with an index. Only public for testing purposes - * - * @param refFile Handle to a reference sequence file. Non-null. - */ - public void setReferenceDataSource(File refFile) { - this.referenceDataSource = new ReferenceDataSource(refFile); - genomeLocParser = new GenomeLocParser(referenceDataSource.getReference()); - } - - /** - * Open the reference-ordered data sources. - * - * @param referenceMetaDataFiles collection of RMD descriptors to load and validate. - * @param sequenceDictionary GATK-wide sequnce dictionary to use for validation. - * @param genomeLocParser to use when creating and validating GenomeLocs. - * @param validationExclusionType potentially indicate which validations to include / exclude. - * @param sampleRenameMap map of file -> new sample name used when doing on-the-fly sample renaming - * - * @return A list of reference-ordered data sources. - */ - private List getReferenceOrderedDataSources(final Collection referenceMetaDataFiles, - final SAMSequenceDictionary sequenceDictionary, - final GenomeLocParser genomeLocParser, - final ValidationExclusion.TYPE validationExclusionType, - final Map sampleRenameMap) { - final RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, validationExclusionType, - getArguments().disableAutoIndexCreationAndLockingWhenReadingRods, - sampleRenameMap); - - final List dataSources = new ArrayList(); - for (RMDTriplet fileDescriptor : referenceMetaDataFiles) - dataSources.add(new ReferenceOrderedDataSource(fileDescriptor, - builder, - sequenceDictionary, - genomeLocParser, - flashbackData())); - - return dataSources; - } - - /** - * Returns the SAM File Header from the input reads' data source file - * @return the SAM File Header from the input reads' data source file - */ - public SAMFileHeader getSAMFileHeader() { - return readsDataSource.getHeader(); - } - - public boolean lenientVCFProcessing() { - return lenientVCFProcessing(argCollection.unsafe); - } - - public static boolean lenientVCFProcessing(final ValidationExclusion.TYPE val) { - return val == ValidationExclusion.TYPE.ALL - || val == ValidationExclusion.TYPE.LENIENT_VCF_PROCESSING; - } - - /** - * Returns the unmerged SAM file header for an individual reader. - * @param reader The reader. - * @return Header for that reader or null if not available. - */ - public SAMFileHeader getSAMFileHeader(SAMReaderID reader) { - return readsDataSource == null ? null : readsDataSource.getHeader(reader); - } - - /** - * Returns an ordered list of the unmerged SAM file headers known to this engine. - * @return list of header for each input SAM file, in command line order - */ - public List getSAMFileHeaders() { - final List headers = new ArrayList(); - for ( final SAMReaderID id : getReadsDataSource().getReaderIDs() ) { - headers.add(getReadsDataSource().getHeader(id)); - } - return headers; - } - - /** - * Gets the master sequence dictionary for this GATK engine instance - * @return a never-null dictionary listing all of the contigs known to this engine instance - */ - public SAMSequenceDictionary getMasterSequenceDictionary() { - return getReferenceDataSource().getReference().getSequenceDictionary(); - } - - /** - * Returns data source object encapsulating all essential info and handlers used to traverse - * reads; header merger, individual file readers etc can be accessed through the returned data source object. - * - * @return the reads data source - */ - public SAMDataSource getReadsDataSource() { - return this.readsDataSource; - } - - /** - * Sets the collection of GATK main application arguments. - * - * @param argCollection the GATK argument collection - */ - public void setArguments(GATKArgumentCollection argCollection) { - this.argCollection = argCollection; - } - - /** - * Gets the collection of GATK main application arguments. - * - * @return the GATK argument collection - */ - public GATKArgumentCollection getArguments() { - return this.argCollection; - } - - /** - * Get the list of intervals passed to the engine. - * @return List of intervals, or null if no intervals are in use - */ - public GenomeLocSortedSet getIntervals() { - return this.intervals; - } - - /** - * Get the list of regions of the genome being processed. If the user - * requested specific intervals, return those, otherwise return regions - * corresponding to the entire genome. Never returns null. - * - * @return a non-null set of intervals being processed - */ - @Ensures("result != null") - public GenomeLocSortedSet getRegionsOfGenomeBeingProcessed() { - if ( getIntervals() == null ) - // if we don't have any intervals defined, create intervals from the reference itself - return GenomeLocSortedSet.createSetFromSequenceDictionary(getReferenceDataSource().getReference().getSequenceDictionary()); - else - return getIntervals(); - } - - /** - * Gets the list of filters employed by this engine. - * @return Collection of filters (actual instances) used by this engine. - */ - public Collection getFilters() { - return this.filters; - } - - /** - * Sets the list of filters employed by this engine. - * @param filters Collection of filters (actual instances) used by this engine. - */ - public void setFilters(Collection filters) { - this.filters = filters; - } - - /** - * Gets the filter manager for this engine. - * @return filter manager for this engine. - */ - protected FilterManager getFilterManager() { - return filterManager; - } - - /** - * Gets the input sources for this engine. - * @return input sources for this engine. - */ - protected Map getInputs() { - return inputs; - } - - /** - * Gets the output stubs for this engine. - * @return output stubs for this engine. - */ - protected Collection> getOutputs() { - return outputs; - } - - /** - * Returns data source objects encapsulating all rod data; - * individual rods can be accessed through the returned data source objects. - * - * @return the rods data sources, never {@code null}. - */ - public List getRodDataSources() { - return this.rodDataSources; - } - - /** - * Gets cumulative metrics about the entire run to this point. - * Returns a clone of this snapshot in time. - * @return cumulative metrics about the entire run at this point. ReadMetrics object is a unique instance and is - * owned by the caller; the caller can do with the object what they wish. - */ - public ReadMetrics getCumulativeMetrics() { - // todo -- probably shouldn't be lazy - if ( cumulativeMetrics == null ) - cumulativeMetrics = readsDataSource == null ? new ReadMetrics() : readsDataSource.getCumulativeReadMetrics(); - return cumulativeMetrics; - } - - /** - * Return the global ThreadEfficiencyMonitor, if there is one - * - * @return the monitor, or null if none is active - */ - public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { - return threadEfficiencyMonitor; - } - - // ------------------------------------------------------------------------------------- - // - // code for working with Samples database - // - // ------------------------------------------------------------------------------------- - - public SampleDB getSampleDB() { - return this.sampleDB; - } - - public Map getApproximateCommandLineArguments(Object... argumentProviders) { - return CommandLineUtils.getApproximateCommandLineArguments(parsingEngine,argumentProviders); - } - - public String createApproximateCommandLineArgumentString(Object... argumentProviders) { - return CommandLineUtils.createApproximateCommandLineArgumentString(parsingEngine,argumentProviders); - } - - // ------------------------------------------------------------------------------------- - // - // code for working with progress meter - // - // ------------------------------------------------------------------------------------- - - /** - * Register the global progress meter with this engine - * - * Calling this function more than once will result in an IllegalStateException - * - * @param meter a non-null progress meter - */ - public void registerProgressMeter(final ProgressMeter meter) { - if ( meter == null ) throw new IllegalArgumentException("Meter cannot be null"); - if ( progressMeter != null ) throw new IllegalStateException("Progress meter already set"); - - progressMeter = meter; - } - - /** - * Get the progress meter being used by this engine. May be null if no meter has been registered yet - * @return a potentially null pointer to the progress meter - */ - public ProgressMeter getProgressMeter() { - return progressMeter; - } - - /** - * Does the current runtime in unit exceed the runtime limit, if one has been provided? - * - * @return false if not limit was requested or if runtime <= the limit, true otherwise - */ - public boolean exceedsRuntimeLimit() { - if ( progressMeter == null ) - // not yet initialized or not set because of testing - return false; - - if ( getArguments().maxRuntime == NO_RUNTIME_LIMIT ) - return false; - else { - final long runtime = progressMeter.getRuntimeInNanosecondsUpdatedPeriodically(); - if ( runtime < 0 ) throw new IllegalArgumentException("runtime must be >= 0 but got " + runtime); - final long maxRuntimeNano = getRuntimeLimitInNanoseconds(); - return runtime > maxRuntimeNano; - } - } - - /** - * @return the runtime limit in nanoseconds, or -1 if no limit was specified - */ - public long getRuntimeLimitInNanoseconds() { - return runtimeLimitInNanoseconds; - } - - /** - * Setup the runtime limits for this engine, updating the runtimeLimitInNanoseconds - * as appropriate - * - * @param args the GATKArgumentCollection to retrieve our runtime limits from - */ - private void setupRuntimeLimits(final GATKArgumentCollection args) { - if ( args.maxRuntime == NO_RUNTIME_LIMIT ) - runtimeLimitInNanoseconds = -1; - else if (args.maxRuntime < 0 ) - throw new UserException.BadArgumentValue("maxRuntime", "must be >= 0 or == -1 (meaning no limit) but received negative value " + args.maxRuntime); - else { - runtimeLimitInNanoseconds = TimeUnit.NANOSECONDS.convert(args.maxRuntime, args.maxRuntimeUnits); - } - } - - /** - * Returns the sample list including all samples. - * @return never {@code null}. - */ - public SampleList getSampleList() { - return new IndexedSampleList(getSampleDB().getSampleNames()); - } - - /** - * Returns the sample list including samples in read inputs. - * @return never {@code null}. - */ - public SampleList getReadSampleList() { - return new IndexedSampleList(SampleUtils.getSAMFileSamples(getSAMFileHeader())); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/ReadProperties.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/ReadProperties.java deleted file mode 100644 index 6ee9ad3a4..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/ReadProperties.java +++ /dev/null @@ -1,198 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMFileReader; -import htsjdk.samtools.ValidationStringency; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.engine.datasources.reads.SAMReaderID; -import org.broadinstitute.gatk.engine.downsampling.DownsamplingMethod; -import org.broadinstitute.gatk.engine.filters.ReadFilter; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; - -import java.util.Collection; -import java.util.List; -/** - * User: hanna - * Date: May 14, 2009 - * Time: 4:06:26 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * A data structure containing information about the reads data sources as well as - * information about how they should be downsampled, sorted, and filtered. - */ -public class ReadProperties { - private final Collection readers; - private final SAMFileHeader header; - private final SAMFileHeader.SortOrder sortOrder; - private final ValidationStringency validationStringency; - private final DownsamplingMethod downsamplingMethod; - private final ValidationExclusion exclusionList; - private final Collection supplementalFilters; - private final List readTransformers; - private final boolean keepUniqueReadListInLIBS; - private final boolean includeReadsWithDeletionAtLoci; - private final boolean useOriginalBaseQualities; - private final byte defaultBaseQualities; - - /** - * Return true if the walker wants to see reads that contain deletions when looking at locus pileups - * - * @return - */ - public boolean includeReadsWithDeletionAtLoci() { - return includeReadsWithDeletionAtLoci; - } - - public boolean keepUniqueReadListInLIBS() { - return keepUniqueReadListInLIBS; - } - - /** - * Gets a list of the files acting as sources of reads. - * @return A list of files storing reads data. - */ - public Collection getSAMReaderIDs() { - return readers; - } - - /** - * Gets the sam file header - * @return the sam file header - */ - public SAMFileHeader getHeader() { - return header; - } - - /** - * Gets the sort order of the reads - * @return the sort order of the reads - */ - public SAMFileHeader.SortOrder getSortOrder() { - return sortOrder; - } - - /** - * How strict should validation be? - * @return Stringency of validation. - */ - public ValidationStringency getValidationStringency() { - return validationStringency; - } - - /** - * Gets the method and parameters used when downsampling reads. - * @return Downsample fraction. - */ - public DownsamplingMethod getDownsamplingMethod() { - return downsamplingMethod; - } - - /** - * Return whether to 'verify' the reads as we pass through them. - * @return Whether to verify the reads. - */ - public ValidationExclusion getValidationExclusionList() { - return exclusionList; - } - - public Collection getSupplementalFilters() { - return supplementalFilters; - } - - - public List getReadTransformers() { - return readTransformers; - } - - /** - * Return whether to use original base qualities. - * @return Whether to use original base qualities. - */ - public boolean useOriginalBaseQualities() { - return useOriginalBaseQualities; - } - - /** - * @return Default base quality value to fill reads missing base quality information. - */ - public byte defaultBaseQualities() { - return defaultBaseQualities; - } - - /** - * Extract the command-line arguments having to do with reads input - * files and store them in an easy-to-work-with package. Constructor - * is package protected. - * @param samFiles list of reads files. - * @param header sam file header. - * @param useOriginalBaseQualities True if original base qualities should be used. - * @param strictness Stringency of reads file parsing. - * @param downsamplingMethod Method for downsampling reads at a given locus. - * @param exclusionList what safety checks we're willing to let slide - * @param supplementalFilters additional filters to dynamically apply. - * @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method - * will explicitly list reads with deletion over the current reference base; otherwise, only observed - * bases will be seen in the pileups, and the deletions will be skipped silently. - * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. - * @param keepUniqueReadListInLIBS If true, we will tell LocusIteratorByState to track the unique reads it sees - * This is really useful for ActiveRegionTraversals - */ - public ReadProperties( Collection samFiles, - SAMFileHeader header, - SAMFileHeader.SortOrder sortOrder, - boolean useOriginalBaseQualities, - ValidationStringency strictness, - DownsamplingMethod downsamplingMethod, - ValidationExclusion exclusionList, - Collection supplementalFilters, - List readTransformers, - boolean includeReadsWithDeletionAtLoci, - byte defaultBaseQualities, - final boolean keepUniqueReadListInLIBS) { - this.readers = samFiles; - this.header = header; - this.sortOrder = sortOrder; - this.validationStringency = strictness; - this.downsamplingMethod = downsamplingMethod == null ? DownsamplingMethod.NONE : downsamplingMethod; - this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList; - this.supplementalFilters = supplementalFilters; - this.readTransformers = readTransformers; - this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci; - this.useOriginalBaseQualities = useOriginalBaseQualities; - this.defaultBaseQualities = defaultBaseQualities; - this.keepUniqueReadListInLIBS = keepUniqueReadListInLIBS; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/WalkerManager.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/WalkerManager.java deleted file mode 100644 index fb9d48903..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/WalkerManager.java +++ /dev/null @@ -1,431 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine; - -import org.broadinstitute.gatk.engine.walkers.*; -import org.broadinstitute.gatk.utils.commandline.Hidden; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.downsampling.DownsamplingMethod; -import org.broadinstitute.gatk.engine.filters.FilterManager; -import org.broadinstitute.gatk.engine.filters.ReadFilter; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.utils.classloader.PluginManager; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.help.ResourceBundleExtractorDoclet; -import org.broadinstitute.gatk.utils.text.TextFormattingUtils; - -import java.lang.annotation.Annotation; -import java.util.*; - -/** - * Plugin manager that also provides various utilities for inspecting Walkers. - */ -public class WalkerManager extends PluginManager { - - /** - * A collection of help text for walkers and their enclosing packages. - */ - private ResourceBundle helpText; - - public WalkerManager() { - super(Walker.class,"walker",""); - helpText = TextFormattingUtils.loadResourceBundle("GATKText"); - } - - /** - * Get the list of walkers currently available to the GATK, organized - * by package. - * @param visibleWalkersOnly If true, return only the walker names that aren't hidden. - * @return Names of currently available walkers. - */ - public Map>> getWalkerNamesByPackage(boolean visibleWalkersOnly) { - Map>> walkersByPackage = new HashMap>>(); - for(Class walker: getPlugins()) { - if(visibleWalkersOnly && isHidden(walker)) - continue; - - // Extract the name for the package; if the walker is in the unnamed package, use the empty string - String walkerPackage = walker.getPackage() != null ? walker.getPackage().getName() : ""; - if(!walkersByPackage.containsKey(walkerPackage)) - walkersByPackage.put(walkerPackage,new ArrayList>()); - walkersByPackage.get(walkerPackage).add(walker); - } - return Collections.unmodifiableMap(walkersByPackage); - } - - /** - * Gets the display name for a given package. - * @param packageName Fully qualified package name. - * @return A suitable display name for the package. - */ - public String getPackageDisplayName(String packageName) { - // ...try to compute the override from the text of the package name, while accounting for - // unpackaged walkers. - String displayName = packageName.substring(packageName.lastIndexOf('.')+1); - if (displayName.trim().equals("")) displayName = ""; - return displayName; - } - - /** - * Gets the help text associated with a given package name. - * @param packageName Package for which to search for help text. - * @return Package help text, or "" if none exists. - */ - public String getPackageSummaryText(String packageName) { - String key = String.format("%s.%s",packageName, ResourceBundleExtractorDoclet.SUMMARY_TAGLET_NAME); - if(!helpText.containsKey(key)) - return ""; - return helpText.getString(key); - } - - /** - * Gets the summary help text associated with a given walker type. - * @param walkerType Type of walker for which to search for help text. - * @return Walker summary description, or "" if none exists. - */ - public String getWalkerSummaryText(Class walkerType) { - String walkerSummary = String.format("%s.%s",walkerType.getName(), ResourceBundleExtractorDoclet.SUMMARY_TAGLET_NAME); - if(!helpText.containsKey(walkerSummary)) - return ""; - return helpText.getString(walkerSummary); - } - - /** - * Gets the summary help text associated with a given walker type. - * @param walker Walker for which to search for help text. - * @return Walker summary description, or "" if none exists. - */ - public String getWalkerSummaryText(Walker walker) { - return getWalkerSummaryText(walker.getClass()); - } - - /** - * Gets the descriptive help text associated with a given walker type. - * @param walkerType Type of walker for which to search for help text. - * @return Walker full description, or "" if none exists. - */ - public String getWalkerDescriptionText(Class walkerType) { - String walkerDescription = String.format("%s.%s",walkerType.getName(), ResourceBundleExtractorDoclet.DESCRIPTION_TAGLET_NAME); - if(!helpText.containsKey(walkerDescription)) - return ""; - return helpText.getString(walkerDescription); - } - - /** - * Gets the descriptive help text associated with a given walker type. - * @param walker Walker for which to search for help text. - * @return Walker full description, or "" if none exists. - */ - public String getWalkerDescriptionText(Walker walker) { - return getWalkerDescriptionText(walker.getClass()); - } - - /** - * Retrieves the walker class given a walker name. - * @param walkerName Name of the walker. - * @return Class representing the walker. - */ - public Class getWalkerClassByName(String walkerName) { - return getPluginsByName().get(walkerName); - } - - /** - * Gets the data source for the provided walker. - * @param walkerClass The class of the walker. - * @return Which type of data source to traverse over...reads or reference? - */ - public static DataSource getWalkerDataSource(Class walkerClass) { - By byDataSource = walkerClass.getAnnotation(By.class); - if( byDataSource == null ) - throw new ReviewedGATKException("Unable to find By annotation for walker class " + walkerClass.getName()); - return byDataSource.value(); - } - - /** - * Gets the data source for the provided walker. - * @param walker The walker. - * @return Which type of data source to traverse over...reads or reference? - */ - public static DataSource getWalkerDataSource(Walker walker) { - return getWalkerDataSource(walker.getClass()); - } - - /** - * Get a list of RODs allowed by the walker. - * @param walkerClass Class of the walker to query. - * @return The list of allowed reference meta data. - */ - public static List getAllowsMetaData(Class walkerClass) { - return Collections.emptyList(); - } - - /** - * Determine whether the given walker supports the given data source. - * @param walkerClass Class of the walker to query. - * @param dataSource Source to check for . - * @return True if the walker forbids this data type. False otherwise. - */ - public static boolean isAllowed(Class walkerClass, DataSource dataSource) { - Allows allowsDataSource = getWalkerAllowed(walkerClass); - - // Allows is less restrictive than requires. If an allows - // clause is not specified, any kind of data is allowed. - if( allowsDataSource == null ) - return true; - - return Arrays.asList(allowsDataSource.value()).contains(dataSource); - } - - /** - * Determine whether the given walker supports the given data source. - * @param walker Walker to query. - * @param dataSource Source to check for . - * @return True if the walker forbids this data type. False otherwise. - */ - public static boolean isAllowed(Walker walker, DataSource dataSource) { - return isAllowed(walker.getClass(), dataSource); - } - - /** - * Determine whether the given walker supports the given reference ordered data. - * @param walkerClass Class of the walker to query. - * @param rod Source to check. - * @return True if the walker forbids this data type. False otherwise. - */ - public static boolean isAllowed(Class walkerClass, ReferenceOrderedDataSource rod) { - return true; - } - - /** - * Determine whether the given walker supports the given reference ordered data. - * @param walker Walker to query. - * @param rod Source to check. - * @return True if the walker forbids this data type. False otherwise. - */ - public static boolean isAllowed(Walker walker, ReferenceOrderedDataSource rod) { - return isAllowed(walker.getClass(), rod); - } - - /** - * Determine whether the given walker requires the given data source. - * @param walkerClass Class of the walker to query. - * @param dataSource Source to check for. - * @return True if the walker allows this data type. False otherwise. - */ - public static boolean isRequired(Class walkerClass, DataSource dataSource) { - Requires requiresDataSource = getWalkerRequirements(walkerClass); - return Arrays.asList(requiresDataSource.value()).contains(dataSource); - } - - /** - * Determine whether the given walker requires the given data source. - * @param walker Walker to query. - * @param dataSource Source to check for. - * @return True if the walker allows this data type. False otherwise. - */ - public static boolean isRequired(Walker walker, DataSource dataSource) { - return isRequired(walker.getClass(), dataSource); - } - - /** - * Get a list of RODs required by the walker. - * @param walkerClass Class of the walker to query. - * @return The list of required reference meta data. - */ - public static List getRequiredMetaData(Class walkerClass) { - return Collections.emptyList(); - } - - /** - * Get a list of RODs required by the walker. - * @param walker Walker to query. - * @return The list of required reference meta data. - */ - public static List getRequiredMetaData(Walker walker) { - return getRequiredMetaData(walker.getClass()); - } - - /** - * Reports whether this walker type is hidden -- in other words, whether it'll appear in the help output. - * @param walkerType Class to test for visibility. - * @return True if the walker should be hidden. False otherwise. - */ - public static boolean isHidden(Class walkerType) { - return walkerType.isAnnotationPresent(Hidden.class); - } - - /** - * Extracts filters that the walker has requested be run on the dataset. - * @param walkerClass Class of the walker to inspect for filtering requests. - * @param filterManager Manages the creation of filters. - * @return A non-empty list of filters to apply to the reads. - */ - public static List getReadFilters(Class walkerClass, FilterManager filterManager) { - List filters = new ArrayList(); - for(Class filterType: getReadFilterTypes(walkerClass)) - filters.add(filterManager.createFilterByType(filterType)); - return filters; - } - - /** - * Extracts filters that the walker has requested be run on the dataset. - * @param walker Walker to inspect for filtering requests. - * @param filterManager Manages the creation of filters. - * @return A non-empty list of filters to apply to the reads. - */ - public static List getReadFilters(Walker walker, FilterManager filterManager) { - return getReadFilters(walker.getClass(), filterManager); - } - - /** - * Gets the type of downsampling method requested by the walker. If an alternative - * downsampling method is specified on the command-line, the command-line version will - * be used instead. - * @param walker The walker to interrogate. - * @return The downsampling method, as specified by the walker. Null if none exists. - */ - public static DownsamplingMethod getDownsamplingMethod( Walker walker ) { - return getDownsamplingMethod(walker.getClass()); - } - - /** - * Gets the type of downsampling method requested by the walker. If an alternative - * downsampling method is specified on the command-line, the command-line version will - * be used instead. - * @param walkerClass The class of the walker to interrogate. - * @return The downsampling method, as specified by the walker. Null if none exists. - */ - public static DownsamplingMethod getDownsamplingMethod( Class walkerClass ) { - DownsamplingMethod downsamplingMethod = null; - - if( walkerClass.isAnnotationPresent(Downsample.class) ) { - Downsample downsampleParameters = walkerClass.getAnnotation(Downsample.class); - DownsampleType type = downsampleParameters.by(); - Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null; - Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null; - downsamplingMethod = new DownsamplingMethod(type, toCoverage, toFraction); - } - - return downsamplingMethod; - } - - public static T getWalkerAnnotation(final Walker walker, final Class clazz) { - return walker.getClass().getAnnotation(clazz); - } - - public static ReadTransformer.ApplicationTime getBAQApplicationTime(Walker walker) { - return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime(); - } - - /** - * Create a name for this type of walker. - * - * @param walkerType The type of walker. - * @return A name for this type of walker. - */ - @Override - public String getName(Class walkerType) { - String walkerName = ""; - - if (walkerType.getAnnotation(WalkerName.class) != null) - walkerName = ((WalkerName)walkerType.getAnnotation(WalkerName.class)).value().trim(); - else - walkerName = super.getName(walkerType); - - return walkerName; - } - - /** - * Utility to get the requires attribute from the walker. - * Throws an exception if requirements are missing. - * @param walkerClass Class of the walker to query for required data. - * @return Required data attribute. - */ - private static Requires getWalkerRequirements(Class walkerClass) { - Requires requiresDataSource = walkerClass.getAnnotation(Requires.class); - if( requiresDataSource == null ) - throw new ReviewedGATKException( "Unable to find data types required by walker class " + walkerClass.getName()); - return requiresDataSource; - } - - /** - * Utility to get the requires attribute from the walker. - * Throws an exception if requirements are missing. - * @param walker Walker to query for required data. - * @return Required data attribute. - */ - private static Requires getWalkerRequirements(Walker walker) { - return getWalkerRequirements(walker.getClass()); - } - - /** - * Utility to get the forbidden attribute from the walker. - * @param walkerClass Class of the walker to query for required data. - * @return Required data attribute. Null if forbidden info isn't present. - */ - private static Allows getWalkerAllowed(Class walkerClass) { - Allows allowsDataSource = walkerClass.getAnnotation(Allows.class); - return allowsDataSource; - } - - /** - * Utility to get the forbidden attribute from the walker. - * @param walker Walker to query for required data. - * @return Required data attribute. Null if forbidden info isn't present. - */ - private static Allows getWalkerAllowed(Walker walker) { - return getWalkerAllowed(walker.getClass()); - } - - /** - * Gets the list of filtering classes specified as walker annotations. - * @param walkerClass Class of the walker to inspect. - * @return An array of types extending from SamRecordFilter. Will never be null. - */ - public static Collection> getReadFilterTypes(Class walkerClass) { - List> filterTypes = new ArrayList>(); - while(walkerClass != null) { - if(walkerClass.isAnnotationPresent(ReadFilters.class)) { - for ( Class c : walkerClass.getAnnotation(ReadFilters.class).value() ) { - if( !filterTypes.contains(c) ) - filterTypes.add(c); - } - } - walkerClass = walkerClass.getSuperclass(); - } - return filterTypes; - } - - /** - * Gets the list of filtering classes specified as walker annotations. - * @param walker The walker to inspect. - * @return An array of types extending from SamRecordFilter. Will never be null. - */ - public static Collection> getReadFilterTypes(Walker walker) { - return getReadFilterTypes(walker.getClass()); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java deleted file mode 100644 index 05834f71b..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java +++ /dev/null @@ -1,628 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.arguments; - -import htsjdk.samtools.ValidationStringency; -import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.downsampling.DownsamplingMethod; -import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; -import org.broadinstitute.gatk.engine.samples.PedigreeValidationType; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.baq.BAQ; -import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; - -import java.io.File; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.TimeUnit; - -/** - * @author aaron - * @version 1.0 - */ -public class GATKArgumentCollection { - - /** the constructor */ - public GATKArgumentCollection() { - } - - // parameters and their defaults - /** - * An input file containing sequence data mapped to a reference, in SAM or BAM format, or a text file containing a - * list of input files (with extension .list). Note that the GATK requires an accompanying index for each SAM or - * BAM file. Please see our online documentation for more details on input formatting requirements. - */ - @Input(fullName = "input_file", shortName = "I", doc = "Input file containing sequence data (SAM or BAM)", required = false) - public List samFiles = new ArrayList<>(); - - @Hidden - @Argument(fullName = "showFullBamList",doc="Emit a log entry (level INFO) containing the full list of sequence data files to be included in the analysis (including files inside .bam.list files).") - public Boolean showFullBamList = false; - - @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false, minValue = 0) - public Integer readBufferSize = null; - - // -------------------------------------------------------------------------------------------------------------- - // - // GATKRunReport options - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * By default, GATK generates a run report that is uploaded to a cloud-based service. This report contains basic - * statistics about the run (which tool was used, whether the run was successful etc.) that help us for debugging - * and development. Up to version 3.2-2 the run report contains a record of the username and hostname associated - * with the run, but it does **NOT** contain any information that could be used to identify patient data. - * Nevertheless, if your data is subject to stringent confidentiality clauses (no outside communication) or if your - * run environment is not connected to the internet, you can disable the reporting system by seeting this option to - * "NO_ET". You will also need to request a key using the online request form on our website (se FAQs). - */ - @Argument(fullName = "phone_home", shortName = "et", doc="Run reporting mode", required = false) - public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.AWS; - /** - * Please see the "phone_home" argument below and the online documentation FAQs for more details on the key system - * and how to request a key. - */ - @Argument(fullName = "gatk_key", shortName = "K", doc="GATK key file required to run with -et NO_ET", required = false) - public File gatkKeyFile = null; - - /** - * The GATKRunReport supports (as of GATK 2.2) tagging GATK runs with an arbitrary tag that can be - * used to group together runs during later analysis. One use of this capability is to tag runs as GATK - * performance tests, so that the performance of the GATK over time can be assessed from the logs directly. - * - * Note that the tags do not conform to any ontology, so you are free to use any tags that you might find - * meaningful. - */ - @Argument(fullName = "tag", shortName = "tag", doc="Tag to identify this GATK run as part of a group of runs", required = false) - public String tag = "NA"; - - // -------------------------------------------------------------------------------------------------------------- - // - // General features - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Reads that fail the specified filters will not be used in the analysis. Multiple filters can be specified separately, - * e.g. you can do -rf MalformedRead -rf BadCigar and so on. Available read filters are listed in the online tool - * documentation. Note that the read name format is e.g. MalformedReadFilter, but at the command line the filter - * name should be given without the Filter suffix; e.g. -rf MalformedRead (NOT -rf MalformedReadFilter, which is not - * recognized by the program). Note also that some read filters are applied by default for some analysis tools; this - * is specified in each tool's documentation. The default filters cannot be disabled. - */ - @Argument(fullName = "read_filter", shortName = "rf", doc = "Filters to apply to reads before analysis", required = false) - public final List readFilters = new ArrayList<>(); - - @ArgumentCollection - public IntervalArgumentCollection intervalArguments = new IntervalArgumentCollection(); - /** - * The reference genome against which the sequence data was mapped. The GATK requires an index file and a dictionary - * file accompanying the reference (please see the online documentation FAQs for more details on these files). Although - * this argument is indicated as being optional, almost all GATK tools require a reference in order to run. - * Note also that while GATK can in theory process genomes from any organism with any number of chromosomes or contigs, - * it is not designed to process draft genome assemblies and performance will decrease as the number of contigs in - * the reference increases. We strongly discourage the use of unfinished genome assemblies containing more than a few - * hundred contigs. Contig numbers in the thousands will most probably cause memory-related crashes. - */ - @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) - public File referenceFile = null; - /** - * If this flag is enabled, the random numbers generated will be different in every run, causing GATK to behave non-deterministically. - */ - @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Use a non-deterministic random seed", required = false) - public boolean nonDeterministicRandomSeed = false; - /** - * To be used in the testing framework where dynamic parallelism can result in differing numbers of calls to the random generator. - */ - @Hidden - @Argument(fullName = "disableDithering",doc="Completely eliminates randomized dithering from rank sum tests.") - public boolean disableDithering = false; - /** - * This will truncate the run but without exiting with a failure. By default the value is interpreted in minutes, but this can be changed with the maxRuntimeUnits argument. - */ - @Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="Stop execution cleanly as soon as maxRuntime has been reached", required = false) - public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT; - - @Argument(fullName = "maxRuntimeUnits", shortName = "maxRuntimeUnits", doc="Unit of time used by maxRuntime", required = false) - public TimeUnit maxRuntimeUnits = TimeUnit.MINUTES; - - // -------------------------------------------------------------------------------------------------------------- - // - // Downsampling Arguments - // - // -------------------------------------------------------------------------------------------------------------- - /** - * There are several ways to downsample reads, i.e. to removed reads from the pile of reads that will be used for analysis. - * See the documentation of the individual downsampling options for details on how they work. Note that Many GATK tools - * specify a default downsampling type and target, but this behavior can be overridden from command line using the - * downsampling arguments. - */ - @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of read downsampling to employ at a given locus", required = false) - public DownsampleType downsamplingType = null; - /** - * Reads will be downsampled so the specified fraction remains; e.g. if you specify -dfrac 0.25, three-quarters of - * the reads will be removed, and the remaining one quarter will be used in the analysis. This method of downsampling - * is truly unbiased and random. It is typically used to simulate the effect of generating different amounts of - * sequence data for a given sample. For example, you can use this in a pilot experiment to evaluate how much target - * coverage you need to aim for in order to obtain enough coverage in all loci of interest. - */ - @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction of reads to downsample to", required = false, minValue = 0.0, maxValue = 1.0) - public Double downsampleFraction = null; - - /** - * The principle of this downsampling type is to downsample reads to a given capping threshold coverage. Its purpose is to - * get rid of excessive coverage, because above a certain depth, having additional data is not informative and imposes - * unreasonable computational costs. The downsampling process takes two different forms depending on the type of - * analysis it is used with. - * - * For locus-based traversals (LocusWalkers like UnifiedGenotyper and ActiveRegionWalkers like HaplotypeCaller), - * downsample_to_coverage controls the maximum depth of coverage at each locus. For read-based traversals - * (ReadWalkers like BaseRecalibrator), it controls the maximum number of reads sharing the same alignment start - * position. For ReadWalkers you will typically need to use much lower dcov values than you would with LocusWalkers - * to see an effect. Note that this downsampling option does not produce an unbiased random sampling from all available - * reads at each locus: instead, the primary goal of the to-coverage downsampler is to maintain an even representation - * of reads from all alignment start positions when removing excess coverage. For a truly unbiased random sampling of - * reads, use -dfrac instead. Also note that the coverage target is an approximate goal that is not guaranteed to be - * met exactly: the downsampling algorithm will under some circumstances retain slightly more or less coverage than - * requested. - */ - @Argument(fullName = "downsample_to_coverage", shortName = "dcov", - doc = "Target coverage threshold for downsampling to coverage", - required = false, minValue = 0) - public Integer downsampleCoverage = null; - - /** - * Gets the downsampling method explicitly specified by the user. If the user didn't specify - * a default downsampling mechanism, return the default. - * @return The explicitly specified downsampling mechanism, or the default if none exists. - */ - public DownsamplingMethod getDownsamplingMethod() { - if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null ) - return null; - - return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction); - } - - /** - * Set the downsampling method stored in the argument collection so that it is read back out when interrogating the command line arguments. - * @param method The downsampling mechanism. - */ - public void setDownsamplingMethod(DownsamplingMethod method) { - if (method == null) - throw new IllegalArgumentException("method is null"); - - downsamplingType = method.type; - downsampleCoverage = method.toCoverage; - downsampleFraction = method.toFraction; - } - - // -------------------------------------------------------------------------------------------------------------- - // - // BAQ arguments - // - // -------------------------------------------------------------------------------------------------------------- - @Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false) - public BAQ.CalculationMode BAQMode = BAQ.CalculationMode.OFF; - /** - * Phred-scaled gap open penalty for BAQ calculation. Although the default value is 40, a value of 30 may be better for whole genome call sets. - */ - @Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty", required = false, minValue = 0) - public double BAQGOP = BAQ.DEFAULT_GOP; - - // -------------------------------------------------------------------------------------------------------------- - // - // refactor NDN cigar string arguments - // - // -------------------------------------------------------------------------------------------------------------- - /** - * This flag tells GATK to refactor cigar string with NDN elements to one element. It intended primarily for use in - * a RNAseq pipeline since the problem might come up when using RNAseq aligner such as Tophat2 with provided transcriptoms. - * You should only use this if you know that your reads have that problem. - */ - @Argument(fullName = "refactor_NDN_cigar_string", shortName = "fixNDN", doc = "refactor cigar string with NDN elements to one element", required = false) - public boolean REFACTOR_NDN_CIGAR_READS = false; - - // -------------------------------------------------------------------------------------------------------------- - // - // quality encoding checking arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * By default the GATK assumes that base quality scores start at Q0 == ASCII 33 according to the SAM specification. - * However, encoding in some datasets (especially older Illumina ones) starts at Q64. This argument will fix the - * encodings on the fly (as the data is read in) by subtracting 31 from every quality score. Note that this argument should - * NEVER be used by default; you should only use it when you have confirmed that the quality scores in your data are - * not in the correct encoding. - */ - @Argument(fullName = "fix_misencoded_quality_scores", shortName="fixMisencodedQuals", doc="Fix mis-encoded base quality scores", required = false) - public boolean FIX_MISENCODED_QUALS = false; - /** - * This flag tells GATK to ignore warnings when encountering base qualities that are too high and that seemingly - * indicate a problem with the base quality encoding of the BAM file. You should only use this if you really know - * what you are doing; otherwise you could seriously mess up your data and ruin your analysis. - */ - @Argument(fullName = "allow_potentially_misencoded_quality_scores", shortName="allowPotentiallyMisencodedQuals", doc="Ignore warnings about base quality score encoding", required = false) - public boolean ALLOW_POTENTIALLY_MISENCODED_QUALS = false; - /** - * This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration) which - * are stored in the OQ tag, if they are present, rather than use the post-recalibration quality scores. If no OQ - * tag is present for a read, the standard qual score will be used. - */ - @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "Use the base quality scores from the OQ tag", required=false) - public Boolean useOriginalBaseQualities = false; - /** - * If reads are missing some or all base quality scores, this value will be used for all base quality scores. - * By default this is set to -1 to disable default base quality assignment. - */ - @Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "Assign a default base quality", required=false, minValue = 0, maxValue = Byte.MAX_VALUE) - public byte defaultBaseQualities = -1; - - // -------------------------------------------------------------------------------------------------------------- - // - // performance log arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * The file name for the GATK performance log output, or null if you don't want to generate the - * detailed performance logging table. This table is suitable for importing into R or any - * other analysis software that can read tsv files. - */ - @Argument(fullName = "performanceLog", shortName="PF", doc="Write GATK runtime performance log to this file", required = false) - public File performanceLog = null; - - // -------------------------------------------------------------------------------------------------------------- - // - // BQSR arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Enables on-the-fly recalibrate of base qualities, intended primarily for use with BaseRecalibrator and PrintReads - * (see Best Practices workflow documentation). The covariates tables are produced by the BaseRecalibrator tool. - * Please be aware that you should only run recalibration with the covariates file created on the same input bam(s). - */ - @Input(fullName="BQSR", shortName="BQSR", required=false, doc="Input covariates table file for on-the-fly base quality score recalibration") - public File BQSR_RECAL_FILE = null; - - /** - * Turns on the base quantization module. It requires a recalibration report (-BQSR). - * - * A value of 0 here means "do not quantize". - * Any value greater than zero will be used to recalculate the quantization using that many levels. - * Negative values mean that we should quantize using the recalibration report's quantization level. - */ - @Hidden - @Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels (with -BQSR)", required=false) - public int quantizationLevels = 0; - - /** - * Turns off printing of the base insertion and base deletion tags when using the -BQSR argument. Only the base substitution qualities will be produced. - */ - @Argument(fullName="disable_indel_quals", shortName = "DIQ", doc = "Disable printing of base insertion and deletion tags (with -BQSR)", required=false) - public boolean disableIndelQuals = false; - - /** - * By default, the OQ tag in not emitted when using the -BQSR argument. Use this flag to include OQ tags in the output BAM file. - * Note that this may results in significant file size increase. - */ - @Argument(fullName="emit_original_quals", shortName = "EOQ", doc = "Emit the OQ tag with the original base qualities (with -BQSR)", required=false) - public boolean emitOriginalQuals = false; - - /** - * This flag tells GATK not to modify quality scores less than this value. Instead they will be written out unmodified in the recalibrated BAM file. - * In general it's unsafe to change qualities scores below < 6, since base callers use these values to indicate random or bad bases. - * For example, Illumina writes Q2 bases when the machine has really gone wrong. This would be fine in and of itself, - * but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect, - * your Q2 bin can be elevated to Q8 or Q10, leading to issues downstream. - */ - @Argument(fullName = "preserve_qscores_less_than", shortName = "preserveQ", doc = "Don't recalibrate bases with quality scores less than this threshold (with -BQSR)", required = false, minValue = 0, minRecommendedValue = QualityUtils.MIN_USABLE_Q_SCORE) - public int PRESERVE_QSCORES_LESS_THAN = QualityUtils.MIN_USABLE_Q_SCORE; - /** - * If specified, this value will be used as the prior for all mismatch quality scores instead of the actual reported quality score. - */ - @Argument(fullName = "globalQScorePrior", shortName = "globalQScorePrior", doc = "Global Qscore Bayesian prior to use for BQSR", required = false) - public double globalQScorePrior = -1.0; - - - // -------------------------------------------------------------------------------------------------------------- - // - // Other utility arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Keep in mind that if you set this to LENIENT, we may refuse to provide you with support if anything goes wrong. - */ - @Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation", required = false) - public ValidationStringency strictnessLevel = ValidationStringency.SILENT; - /** - * Some tools keep program records in the SAM header by default. Use this argument to override that behavior and discard program records for the SAM header. - */ - @Argument(fullName = "remove_program_records", shortName = "rpr", doc = "Remove program records from the SAM header", required = false) - public boolean removeProgramRecords = false; - /** - * Some tools discard program records from the SAM header by default. Use this argument to override that behavior and keep program records in the SAM header. - */ - @Argument(fullName = "keep_program_records", shortName = "kpr", doc = "Keep program records in the SAM header", required = false) - public boolean keepProgramRecords = false; - - /** - * On-the-fly sample renaming works only with single-sample BAM and VCF files. Each line of the mapping file must - * contain the absolute path to a BAM or VCF file, followed by whitespace, followed by the new sample name for that - * BAM or VCF file. The sample name may contain non-tab whitespace, but leading or trailing whitespace will be - * ignored. The engine will verify at runtime that each BAM/VCF targeted for sample renaming has only a single - * sample specified in its header (though, in the case of BAM files, there may be multiple read groups for that - * sample). - */ - @Advanced - @Argument(fullName = "sample_rename_mapping_file", shortName = "sample_rename_mapping_file", doc = "Rename sample IDs on-the-fly at runtime using the provided mapping file", required = false) - public File sampleRenameMappingFile = null; - - /** - * For expert users only who know what they are doing. We do not support usage of this argument, so we may refuse to help you if you use it and something goes wrong. The one exception to this rule is ALLOW_N_CIGAR_READS, which is necessary for RNAseq analysis. - */ - @Argument(fullName = "unsafe", shortName = "U", doc = "Enable unsafe operations: nothing will be checked at runtime", required = false) - public ValidationExclusion.TYPE unsafe; - /** - * UNSAFE FOR GENERAL USE (FOR TEST SUITE USE ONLY). Disable both auto-generation of index files and index file locking - * when reading VCFs and other rods and an index isn't present or is out-of-date. The file locking necessary for auto index - * generation to work safely is prone to random failures/hangs on certain platforms, which makes it desirable to disable it - * for situations like test suite runs where the indices are already known to exist, however this option is unsafe in general - * because it allows reading from index files without first acquiring a lock. - */ - @Hidden - @Advanced - @Argument(fullName = "disable_auto_index_creation_and_locking_when_reading_rods", shortName = "disable_auto_index_creation_and_locking_when_reading_rods", - doc = "Disable both auto-generation of index files and index file locking", - required = false) - public boolean disableAutoIndexCreationAndLockingWhenReadingRods = false; - - @Hidden - @Argument(fullName = "no_cmdline_in_header", shortName = "no_cmdline_in_header", doc = "Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.", - required = false) - public boolean disableCommandLineInVCF = false; - - @Argument(fullName = "sites_only", shortName = "sites_only", doc = "Just output sites without genotypes (i.e. only the first 8 columns of the VCF)", - required = false) - public boolean sitesOnlyVCF = false; - - /** - *

    The VCF specification permits missing records to be dropped from the end of FORMAT fields, so long as GT is always output. - * This option prevents GATK from performing that trimming.

    - * - *

    For example, given a FORMAT of

    GT:AD:DP:PL
    , GATK will by default emit
    ./.
    for a variant with - * no reads present (ie, the AD, DP, and PL fields are trimmed). If you specify -writeFullFormat, this record - * would be emitted as
    ./.:.:.:.

    - */ - @Argument(fullName = "never_trim_vcf_format_field", shortName = "writeFullFormat", doc = "Always output all the records in VCF FORMAT fields, even if some are missing", - required = false) - public boolean neverTrimVCFFormatField = false; - - @Hidden - @Argument(fullName = "bcf", shortName = "bcf", doc = "Force BCF output, regardless of the file's extension", - required = false) - public boolean forceBCFOutput = false; - - @Advanced - @Argument(fullName = "bam_compression", shortName = "compress", doc = "Compression level to use for writing BAM files (0 - 9, higher is more compressed)", - minValue = 0, maxValue = 9, required = false) - public Integer bamCompression = null; - - @Advanced - @Argument(fullName = "simplifyBAM", shortName = "simplifyBAM", - doc = "If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well stripping all extended tags from the kept reads except the read group identifier", - required = false) - public boolean simplifyBAM = false; - - @Argument(fullName = "disable_bam_indexing", doc = "Turn off on-the-fly creation of indices for output BAM files.", - required = false) - public boolean disableBAMIndexing = false; - - @Argument(fullName = "generate_md5", doc = "Enable on-the-fly creation of md5s for output BAM files.", - required = false) - public boolean enableBAMmd5 = false; - - // -------------------------------------------------------------------------------------------------------------- - // - // Multi-threading arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Data threads contains N cpu threads per data thread, and act as completely data parallel processing, increasing - * the memory usage of GATK by M data threads. Data threads generally scale extremely effectively, up to 24 cores. - * See online documentation FAQs for more information. - */ - @Argument(fullName = "num_threads", shortName = "nt", doc = "Number of data threads to allocate to this analysis", required = false, minValue = 1) - public Integer numberOfDataThreads = 1; - - /** - * Each CPU thread operates the map cycle independently, but may run into earlier scaling problems with IO than - * data threads. Has the benefit of not requiring X times as much memory per thread as data threads do, but rather - * only a constant overhead. See online documentation FAQs for more information. - */ - @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="Number of CPU threads to allocate per data thread", required = false, minValue = 1) - public int numberOfCPUThreadsPerDataThread = 1; - - @Argument(fullName="num_io_threads", shortName = "nit", doc="Number of given threads to allocate to IO", required = false, minValue = 0) - @Hidden - public int numberOfIOThreads = 0; - - /** - * Enable GATK to monitor its own threading efficiency, at an itsy-bitsy tiny - * cost (< 0.1%) in runtime because of turning on the JavaBean. This is largely for - * debugging purposes. Note that this argument is not compatible with -nt, it only works with -nct. - */ - @Argument(fullName = "monitorThreadEfficiency", shortName = "mte", doc = "Enable threading efficiency monitoring", required = false) - public Boolean monitorThreadEfficiency = false; - - @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="Total number of BAM file handles to keep open simultaneously", required=false, minValue = 1) - public Integer numberOfBAMFileHandles = null; - /** - * This will filter out read groups matching : (e.g. SM:sample1) or a .txt file containing the filter strings one per line. - */ - @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Exclude read groups based on tags", required = false) - public List readGroupBlackList = null; - - // -------------------------------------------------------------------------------------------------------------- - // - // PED (pedigree) support - // - // -------------------------------------------------------------------------------------------------------------- - - /** - *

    Reads PED file-formatted tabular text files describing meta-data about the samples being - * processed in the GATK.

    - * - * - * - *

    The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:

    - * - *
      - *
    • Family ID
    • - *
    • Individual ID
    • - *
    • Paternal ID
    • - *
    • Maternal ID
    • - *
    • Sex (1=male; 2=female; other=unknown)
    • - *
    • Phenotype
    • - *
    - * - *

    The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. - * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a - * quantitative trait or an affection status column: GATK will automatically detect which type - * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).

    - * - *

    If an individual's sex is unknown, then any character other than 1 or 2 can be used.

    - * - *

    You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that - * line will be ignored. Do not start any family IDs with this character therefore.

    - * - *

    Affection status should be coded:

    - * - *
      - *
    • -9 missing
    • - *
    • 0 missing
    • - *
    • 1 unaffected
    • - *
    • 2 affected
    • - *
    - * - *

    If any value outside of -9,0,1,2 is detected than the samples are assumed - * to phenotype values are interpreted as string phenotype values. In this case -9 uniquely - * represents the missing value.

    - * - *

    Genotypes (column 7 onwards) cannot be specified to the GATK.

    - * - *

    For example, here are two individuals (one row = one person):

    - * - *
    -     *   FAM001  1  0 0  1  2
    -     *   FAM001  2  0 0  1  2
    -     * 
    - * - *

    Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to - * tell the GATK PED parser that the corresponding fields are missing from the ped file.

    - * - *

    Note that most GATK walkers do not use pedigree information. Walkers that require pedigree - * data should clearly indicate so in their arguments and will throw errors if required pedigree - * information is missing.

    - */ - @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false) - public List pedigreeFiles = Collections.emptyList(); - - /** - * Inline PED records (see -ped argument). Each -pedString STRING can contain one or more - * valid PED records (see -ped) separated by semi-colons. Supports all tags for each pedString - * as -ped supports - */ - @Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for samples",required=false) - public List pedigreeStrings = Collections.emptyList(); - - /** - * How strict should we be in parsing the PED files? - */ - @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="Validation strictness for pedigree information",required=false) - public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT; - - // -------------------------------------------------------------------------------------------------------------- - // - // BAM indexing and sharding arguments - // - // -------------------------------------------------------------------------------------------------------------- - /** - * NO INTEGRATION TESTS are available. Use at your own risk. - */ - @Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM",required=false) - @Hidden - public boolean allowIntervalsWithUnindexedBAM = false; - - // -------------------------------------------------------------------------------------------------------------- - // - // testing BCF2 - // - // -------------------------------------------------------------------------------------------------------------- - /** - * If provided, whenever we create a VCFWriter we will also write out a BCF file alongside it, for testing purposes. - */ - @Argument(fullName="generateShadowBCF",shortName = "generateShadowBCF",doc="Write a BCF copy of the output VCF",required=false) - @Hidden - public boolean generateShadowBCF = false; - // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed - - // -------------------------------------------------------------------------------------------------------------- - // - // VCF/BCF index parameters - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Specify the Tribble indexing strategy to use for VCFs. - * - * LINEAR creates a LinearIndex with bins of equal width, specified by the Bin Width parameter - * INTERVAL creates an IntervalTreeIndex with bins with an equal amount of features, specified by the Features Per Bin parameter - * DYNAMIC_SEEK attempts to optimize for minimal seek time by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) - * DYNAMIC_SIZE attempts to optimize for minimal index size by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) - */ - @Argument(fullName="variant_index_type",shortName = "variant_index_type",doc="Type of IndexCreator to use for VCF/BCF indices",required=false) - @Advanced - public GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE; - /** - * This is either the bin width or the number of features per bin, depending on the indexing strategy - */ - @Argument(fullName="variant_index_parameter",shortName = "variant_index_parameter",doc="Parameter to pass to the VCF/BCF IndexCreator",required=false) - @Advanced - public int variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER; -} - diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/ValidationExclusion.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/ValidationExclusion.java deleted file mode 100644 index ccd4fdc44..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/ValidationExclusion.java +++ /dev/null @@ -1,67 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.arguments; - -import org.broadinstitute.gatk.utils.commandline.EnumerationArgumentDefault; - -import java.util.ArrayList; -import java.util.List; - - -public class ValidationExclusion { - - // our validation options - - public enum TYPE { - ALLOW_N_CIGAR_READS, // ignore the presence of N operators in CIGARs: do not blow up and process reads that contain one or more N operators. - // This exclusion does not have effect on reads that get filtered {@see MalformedReadFilter}. - ALLOW_UNINDEXED_BAM, // allow bam files that do not have an index; we'll traverse them using monolithic shard - ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set - NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file - ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompabilities - LENIENT_VCF_PROCESSING, // allow non-standard values for standard VCF header lines. Don't worry about size differences between header and values, etc. - @EnumerationArgumentDefault // set the ALL value to the default value, so if they specify just -U, we get the ALL - ALL // do not check for all of the above conditions, DEFAULT - } - - // a storage for the passed in exclusions - List exclusions = new ArrayList(); - - public ValidationExclusion(List exclusionsList) { - exclusions.addAll(exclusionsList); - } - - public ValidationExclusion() {} - - /** - * do we contain the exclusion specified, or were we set to ALL - * @param t the exclusion case to test for - * @return true if we contain the exclusion or if we're set to ALL, false otherwise - */ - public boolean contains(TYPE t) { - return (exclusions.contains(TYPE.ALL) || exclusions.contains(t)); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/AlignmentContext.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/AlignmentContext.java deleted file mode 100644 index 6ac204865..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/AlignmentContext.java +++ /dev/null @@ -1,154 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.contexts; - -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.HasGenomeLocation; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.List; - -/** - * Useful class for forwarding on locusContext data from this iterator - * - * Created by IntelliJ IDEA. - * User: mdepristo - * Date: Feb 22, 2009 - * Time: 3:01:34 PM - * To change this template use File | Settings | File Templates. - */ -public class AlignmentContext implements HasGenomeLocation { - protected GenomeLoc loc = null; - protected ReadBackedPileup basePileup = null; - protected boolean hasPileupBeenDownsampled; - - /** - * The number of bases we've skipped over in the reference since the last map invocation. - * Only filled in by RodTraversals right now. By default, nothing is being skipped, so skippedBases == 0. - */ - private long skippedBases = 0; - - public AlignmentContext(GenomeLoc loc, ReadBackedPileup basePileup) { - this(loc, basePileup, 0, false); - } - - public AlignmentContext(GenomeLoc loc, ReadBackedPileup basePileup, boolean hasPileupBeenDownsampled) { - this(loc, basePileup, 0, hasPileupBeenDownsampled); - } - - public AlignmentContext(GenomeLoc loc, ReadBackedPileup basePileup, long skippedBases) { - this(loc, basePileup, skippedBases, false); - } - - public AlignmentContext(GenomeLoc loc, ReadBackedPileup basePileup, long skippedBases,boolean hasPileupBeenDownsampled ) { - if ( loc == null ) throw new ReviewedGATKException("BUG: GenomeLoc in Alignment context is null"); - if ( basePileup == null ) throw new ReviewedGATKException("BUG: ReadBackedPileup in Alignment context is null"); - if ( skippedBases < 0 ) throw new ReviewedGATKException("BUG: skippedBases is -1 in Alignment context"); - - this.loc = loc; - this.basePileup = basePileup; - this.skippedBases = skippedBases; - this.hasPileupBeenDownsampled = hasPileupBeenDownsampled; - } - - /** Returns base pileup over the current genomic location. Deprectated. Use getBasePileup() to make your intentions - * clear. - * @return - */ - @Deprecated - public ReadBackedPileup getPileup() { return basePileup; } - - /** Returns base pileup over the current genomic location. May return null if this context keeps only - * extended event (indel) pileup. - * @return - */ - public ReadBackedPileup getBasePileup() { - return basePileup; - } - - /** - * Returns true if any reads have been filtered out of the pileup due to excess DoC. - * @return True if reads have been filtered out. False otherwise. - */ - public boolean hasPileupBeenDownsampled() { return hasPileupBeenDownsampled; } - - /** - * get all of the reads within this context - * - * @return - */ - @Deprecated - //todo: unsafe and tailored for current usage only; both pileups can be null or worse, bot can be not null in theory - public List getReads() { return ( basePileup.getReads() ); } - - /** - * Are there any reads associated with this locus? - * - * @return - */ - public boolean hasReads() { - return basePileup != null && basePileup.getNumberOfElements() > 0 ; - } - - /** - * How many reads cover this locus? - * @return - */ - public int size() { - return basePileup.getNumberOfElements(); - } - - /** - * get a list of the equivalent positions within in the reads at Pos - * - * @return - */ - @Deprecated - public List getOffsets() { - return basePileup.getOffsets(); - } - - public String getContig() { return getLocation().getContig(); } - public long getPosition() { return getLocation().getStart(); } - public GenomeLoc getLocation() { return loc; } - - public void downsampleToCoverage(int coverage) { - basePileup = basePileup.getDownsampledPileup(coverage); - hasPileupBeenDownsampled = true; - } - - /** - * Returns the number of bases we've skipped over in the reference since the last map invocation. - * Only filled in by RodTraversals right now. A value of 0 indicates that no bases were skipped. - * - * @return the number of skipped bases - */ - public long getSkippedBases() { - return skippedBases; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/AlignmentContextUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/AlignmentContextUtils.java deleted file mode 100644 index afeb1e735..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/AlignmentContextUtils.java +++ /dev/null @@ -1,150 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.contexts; - -import htsjdk.samtools.SAMReadGroupRecord; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.pileup.*; - -import java.util.*; - -/** - * Useful utilities for storing different AlignmentContexts - * User: ebanks - */ -public class AlignmentContextUtils { - - // Definitions: - // COMPLETE = full alignment context - // FORWARD = reads on forward strand - // REVERSE = reads on forward strand - // - public enum ReadOrientation { COMPLETE, FORWARD, REVERSE } - - private AlignmentContextUtils() { - // cannot be instantiated - } - - /** - * Returns a potentially derived subcontext containing only forward, reverse, or in fact all reads - * in alignment context context. - * - * @param context - * @param type - * @return - */ - public static AlignmentContext stratify(AlignmentContext context, ReadOrientation type) { - switch(type) { - case COMPLETE: - return context; - case FORWARD: - return new AlignmentContext(context.getLocation(),context.getPileup().getPositiveStrandPileup()); - case REVERSE: - return new AlignmentContext(context.getLocation(),context.getPileup().getNegativeStrandPileup()); - default: - throw new ReviewedGATKException("Unable to get alignment context for type = " + type); - } - } - - public static Map splitContextBySampleName(AlignmentContext context) { - return splitContextBySampleName(context, null); - } - - /** - * Splits the given AlignmentContext into a StratifiedAlignmentContext per sample, but referencd by sample name instead - * of sample object. - * - * @param context the original pileup - * - * @return a Map of sample name to StratifiedAlignmentContext - * - **/ - public static Map splitContextBySampleName(AlignmentContext context, String assumedSingleSample) { - GenomeLoc loc = context.getLocation(); - HashMap contexts = new HashMap(); - - for(String sample: context.getPileup().getSamples()) { - ReadBackedPileup pileupBySample = context.getPileup().getPileupForSample(sample); - - // Don't add empty pileups to the split context. - if(pileupBySample.getNumberOfElements() == 0) - continue; - - if(sample != null) - contexts.put(sample, new AlignmentContext(loc, pileupBySample)); - else { - if(assumedSingleSample == null) { - throw new UserException.ReadMissingReadGroup(pileupBySample.iterator().next().getRead()); - } - contexts.put(assumedSingleSample,new AlignmentContext(loc, pileupBySample)); - } - } - - return contexts; - } - - /** - * Splits the AlignmentContext into one context per read group - * - * @param context the original pileup - * @return a Map of ReadGroup to AlignmentContext, or an empty map if context has no base pileup - * - **/ - public static Map splitContextByReadGroup(AlignmentContext context, Collection readGroups) { - HashMap contexts = new HashMap(); - - for (SAMReadGroupRecord rg : readGroups) { - ReadBackedPileup rgPileup = context.getBasePileup().getPileupForReadGroup(rg.getReadGroupId()); - if ( rgPileup != null ) // there we some reads for RG - contexts.put(rg, new AlignmentContext(context.getLocation(), rgPileup)); - } - - return contexts; - } - - public static Map splitContextBySampleName(ReadBackedPileup pileup) { - return splitContextBySampleName(new AlignmentContext(pileup.getLocation(), pileup)); - } - - - public static AlignmentContext joinContexts(Collection contexts) { - // validation - GenomeLoc loc = contexts.iterator().next().getLocation(); - for(AlignmentContext context: contexts) { - if(!loc.equals(context.getLocation())) - throw new ReviewedGATKException("Illegal attempt to join contexts from different genomic locations"); - } - - List pe = new ArrayList(); - for(AlignmentContext context: contexts) { - for(PileupElement pileupElement: context.basePileup) - pe.add(pileupElement); - } - return new AlignmentContext(loc, new ReadBackedPileupImpl(loc,pe)); - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/ReferenceContext.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/ReferenceContext.java deleted file mode 100644 index 201ea49fd..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/ReferenceContext.java +++ /dev/null @@ -1,217 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.contexts; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; - -/** - * The section of the reference that overlaps with the given - * read / locus. - * - * @author hanna - * @version 0.1 - */ -public class ReferenceContext { - /** - * Facilitates creation of new GenomeLocs. - */ - final private GenomeLocParser genomeLocParser; - - /** - * The locus. - */ - final private GenomeLoc locus; - - /** - * The window of reference information around the current locus. - */ - final private GenomeLoc window; - - /** - * The bases in the window around the current locus. If null, then bases haven't been fetched yet. - * Bases are always upper cased - */ - private byte[] basesCache = null; - - /** - * Lazy loader to fetch reference bases - */ - final private ReferenceContextRefProvider basesProvider; - - /** - * Interface to create byte[] contexts for lazy loading of the reference - */ - public static interface ReferenceContextRefProvider { - /** - * You must provide a routine that gets the byte[] bases that would have been passed into the - * ReferenceContext. The RC will handling caching. The value of this interface and routine is - * that it is only called when the bytes are actually requested by the walker, not up front. So - * if the walker doesn't need the refBases for whatever reason, there's no overhead to - * provide them. - * - * @return - */ - @Ensures({"result != null"}) - public byte[] getBases(); - } - - private static class ForwardingProvider implements ReferenceContextRefProvider { - byte[] bases; - - public ForwardingProvider( byte base ) { - this(new byte[] { base }); - } - - public ForwardingProvider( byte[] bases ) { - this.bases = bases; - } - - public byte[] getBases() { return bases; } - } - - /** - * Contructor for a simple, windowless reference context. - * @param locus locus of interest. - * @param base reference base at that locus. - */ - @Requires({ - "genomeLocParser != null", - "locus != null", - "locus.size() > 0"}) - public ReferenceContext( GenomeLocParser genomeLocParser, GenomeLoc locus, byte base ) { - this( genomeLocParser, locus, locus, new ForwardingProvider(base) ); - } - - @Requires({ - "genomeLocParser != null", - "locus != null", - "locus.size() > 0", - "window != null", - "window.size() > 0", - "bases != null && bases.length > 0"}) - public ReferenceContext( GenomeLocParser genomeLocParser, GenomeLoc locus, GenomeLoc window, byte[] bases ) { - this( genomeLocParser, locus, window, new ForwardingProvider(bases) ); - } - - @Requires({ - "genomeLocParser != null", - "locus != null", - "locus.size() > 0", - "window != null", - "window.size() > 0", - "basesProvider != null"}) - public ReferenceContext( GenomeLocParser genomeLocParser, GenomeLoc locus, GenomeLoc window, ReferenceContextRefProvider basesProvider ) { - this.genomeLocParser = genomeLocParser; - this.locus = locus; - this.window = window; - this.basesProvider = basesProvider; - } - - /** - * Utility function to load bases from the provider to the cache, if necessary - */ - @Ensures({ - "basesCache != null", - "old(basesCache) == null || old(basesCache) == basesCache"}) - private void fetchBasesFromProvider() { - if ( basesCache == null ) { - basesCache = basesProvider.getBases(); - - // must be an assertion that only runs when the bases are fetch to run in a reasonable amount of time - assert BaseUtils.isUpperCase(basesCache); - } - } - - /** - * @return The genome loc parser associated with this reference context - */ - @Ensures("result != null") - public GenomeLocParser getGenomeLocParser() { - return genomeLocParser; - } - - /** - * The locus currently being examined. - * @return The current locus. - */ - @Ensures("result != null") - public GenomeLoc getLocus() { - return locus; - } - - @Ensures("result != null") - public GenomeLoc getWindow() { - return window; - } - - /** - * Get the base at the given locus. - * @return The base at the given locus from the reference. - */ - public byte getBase() { - return getBases()[(locus.getStart() - window.getStart())]; - } - - /** - * All the bases in the window currently being examined. - * @return All bases available. If the window is of size [0,0], the array will - * contain only the base at the given locus. - */ - @Ensures({"result != null", "result.length > 0"}) - public byte[] getBases() { - fetchBasesFromProvider(); - return basesCache; - } - - /** - * All the bases in the window from the current base forward to the end of the window. - */ - @Ensures({"result != null", "result.length > 0"}) - public byte[] getForwardBases() { - final byte[] bases = getBases(); - final int mid = locus.getStart() - window.getStart(); - // todo -- warning of performance problem, especially if this is called over and over - return new String(bases).substring(mid).getBytes(); - } - - @Deprecated - public char getBaseAsChar() { - return (char)getBase(); - } - - /** - * Get the base at the given locus. - * @return The base at the given locus from the reference. - */ - @Deprecated() - public int getBaseIndex() { - return BaseUtils.simpleBaseToBaseIndex(getBase()); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusView.java deleted file mode 100644 index 56ecce2ef..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusView.java +++ /dev/null @@ -1,169 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.iterators.GenomeLocusIterator; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.Collections; -import java.util.List; -import java.util.NoSuchElementException; -/** - * User: hanna - * Date: May 13, 2009 - * Time: 3:32:30 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * A LocusView over which the user can iterate. - */ - -public class AllLocusView extends LocusView { - private GenomeLocusIterator locusIterator; - - /** - * Gets the next position in the view: next call to next() will jump there. - * Note that both nextPosition and nextLocus are PRE-read and cached. - */ - private GenomeLoc nextPosition = null; - - /** - * What's the next available context? - */ - private AlignmentContext nextLocus = null; - - /** - * Signal not to advance the iterator because we're currently sitting at the next element. - */ - private boolean atNextElement = false; - - /** - * Create a new queue of locus contexts. - * - * @param provider - */ - public AllLocusView(LocusShardDataProvider provider) { - super(provider); - // Seed the state tracking members with the first possible seek position and the first possible locus context. - locusIterator = new GenomeLocusIterator(genomeLocParser, provider.getLocus()); - } - - public boolean hasNext() { - advance(); - return nextPosition != null; - } - - public AlignmentContext next() { - advance(); - - if (nextPosition == null) - throw new NoSuchElementException("No next is available in the all locus view"); - - // Flag to the iterator that no data is waiting in the queue to be processed. - atNextElement = false; - - AlignmentContext currentLocus; - - // If actual data is present, return it. Otherwise, return empty data. - if (nextLocus != null && nextLocus.getLocation().equals(nextPosition)) - currentLocus = nextLocus; - else - currentLocus = createEmptyLocus(nextPosition); - - return currentLocus; - } - - private void advance() { - // Already at the next element? Don't move forward. - if (atNextElement) - return; - - // Out of elements? - if (nextPosition == null && !locusIterator.hasNext()) - return; - - // If nextLocus has been consumed, clear it out to make room for the next incoming locus. - if (nextPosition != null && nextLocus != null && !nextLocus.getLocation().isPast(nextPosition)) { - nextLocus = null; - - // Determine the next locus. The trick is that we may have more than one alignment context at the same - // reference position (regular base pileup, then extended pileup). If next alignment context (that we just pre-read) - // is still at the current position, we do not increment current position and wait for next call to next() to return - // that context. If we know that next context is past the current position, we are done with current - // position - if (hasNextLocus()) { - nextLocus = nextLocus(); - if (nextPosition.equals(nextLocus.getLocation())) { - atNextElement = true; - return; - } - } - } - - // No elements left in queue? Clear out the position state tracker and return. - if (!locusIterator.hasNext()) { - nextPosition = null; - return; - } - - // Actually fill the next position. - nextPosition = locusIterator.next(); - atNextElement = true; - - // Crank the iterator to (if possible) or past the next context. Be careful not to hold a reference to nextLocus - // while using the hasNextLocus() / nextLocus() machinery; this will cause us to use more memory than is optimal. - while (nextLocus == null || nextLocus.getLocation().isBefore(nextPosition)) { - nextLocus = null; - if (!hasNextLocus()) - break; - nextLocus = nextLocus(); - } - } - - /** - * Creates a blank locus context at the specified location. - * - * @param site Site at which to create the blank locus context. - * @return empty context. - */ - private final static List EMPTY_PILEUP_READS = Collections.emptyList(); - private final static List EMPTY_PILEUP_OFFSETS = Collections.emptyList(); - private final static List EMPTY_DELETION_STATUS = Collections.emptyList(); - - private AlignmentContext createEmptyLocus(GenomeLoc site) { - return new AlignmentContext(site, new ReadBackedPileupImpl(site, EMPTY_PILEUP_READS, EMPTY_PILEUP_OFFSETS)); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusView.java deleted file mode 100644 index 900612a49..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusView.java +++ /dev/null @@ -1,63 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -/** - * User: hanna - * Date: May 12, 2009 - * Time: 11:24:42 AM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * A queue of locus contexts. Provides unidirectional seek. Stripped down - * implementation of java.util.Queue interface. - */ - -public class CoveredLocusView extends LocusView { - /** - * Create a new queue of locus contexts. - * @param provider - */ - public CoveredLocusView(LocusShardDataProvider provider) { - super(provider); - } - - public boolean hasNext() { - return hasNextLocus(); - } - - public AlignmentContext next() { - return nextLocus(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalOverlappingRODsFromStream.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalOverlappingRODsFromStream.java deleted file mode 100644 index 9100905f3..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalOverlappingRODsFromStream.java +++ /dev/null @@ -1,168 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import htsjdk.samtools.util.PeekableIterator; -import org.broadinstitute.gatk.engine.refdata.RODRecordListImpl; -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; - -import java.util.Collection; -import java.util.LinkedList; -import java.util.ListIterator; - -/** - * Key algorithmic helper for ReadBasedReferenceOrderedData - * - * Takes a single iterator of features, and provides a single capability that returns - * the list of RODs that overlap an interval. Allows sequential getOverlapping calls - * from intervals provided that these intervals always have increasing getStart() values. - * - */ -class IntervalOverlappingRODsFromStream { - /** - * Only held for QC purposes - */ - GenomeLoc lastQuery = null; - - private final String name; - private final LinkedList currentFeatures = new LinkedList(); - private final PeekableIterator futureFeatures; - - /** - * Create a new IntervalOverlappingRODsFromStream that reads elements from futureFeatures and - * returns RODRecordLists having name - * - * @param name - * @param futureFeatures - */ - IntervalOverlappingRODsFromStream(final String name, final PeekableIterator futureFeatures) { - if ( futureFeatures == null ) throw new IllegalArgumentException("futureFeatures cannot be null"); - - this.name = name; - this.futureFeatures = futureFeatures; - } - - /** - * Get the list of RODs overlapping loc from this stream of RODs. - * - * @param loc the interval to query - * @return a non-null RODRecordList containing the overlapping RODs, which may be empty - */ - @Ensures({"overlaps(loc, result)", - "! futureFeatures.hasNext() || futureFeatures.peek().getLocation().isPast(loc)", - "result != null"}) - public RODRecordList getOverlapping(final GenomeLoc loc) { - if ( lastQuery != null && loc.getStart() < lastQuery.getStart() ) - throw new IllegalArgumentException(String.format("BUG: query interval (%s) starts before the previous interval %s", loc, lastQuery)); - - readOverlappingFutureFeatures(loc); - return new RODRecordListImpl(name, subsetToOverlapping(loc, currentFeatures), loc); - } - - - /** - * For contract assurance. Checks that all bindings in loc overlap - * - * @param loc - * @param bindings - * @return - */ - @Requires({"loc != null", "bindings != null"}) - private boolean overlaps(final GenomeLoc loc, final RODRecordList bindings) { - for ( final GATKFeature feature : bindings ) - if ( ! feature.getLocation().overlapsP(loc) ) - return false; - return true; - } - - /** - * Subset the features in all to those that overlap with loc - * - * The current features list contains everything read that cannot be thrown away yet, but not - * everything in there necessarily overlaps with loc. Subset to just those that do overlap - * - * @param loc the location that features must overlap - * @param all the list of all features - * @return a subset of all that overlaps with loc - */ - @Requires({"loc != null", "all != null"}) - @Ensures("result.size() <= all.size()") - private Collection subsetToOverlapping(final GenomeLoc loc, final Collection all) { - final LinkedList overlapping = new LinkedList(); - for ( final GATKFeature feature : all ) - if ( feature.getLocation().overlapsP(loc) ) - overlapping.add(feature); - return overlapping; - } - - /** - * Update function. Remove all elements of currentFeatures that end before loc - * - * Must be called by clients periodically when they know they they will never ask for data before - * loc, so that the running cache of RODs doesn't grow out of control. - * - * @param loc the location to use - */ - @Requires("loc != null") - @Ensures("currentFeatures.size() <= old(currentFeatures.size())") - public void trimCurrentFeaturesToLoc(final GenomeLoc loc) { - final ListIterator it = currentFeatures.listIterator(); - while ( it.hasNext() ) { - final GATKFeature feature = it.next(); - if ( feature.getLocation().isBefore(loc) ) - it.remove(); - } - } - - /** - * Update function: Read all elements from futureFeatures that overlap with loc - * - * Stops at the first element that starts before the end of loc, or the stream empties - * - * @param loc - */ - @Requires("loc != null") - @Ensures("currentFeatures.size() >= old(currentFeatures.size())") - private void readOverlappingFutureFeatures(final GenomeLoc loc) { - while ( futureFeatures.hasNext() ) { - final GenomeLoc nextLoc = futureFeatures.peek().getLocation(); - if ( nextLoc.isBefore(loc) ) { - futureFeatures.next(); // next rod element is before loc, throw it away and keep looking - } else if ( nextLoc.isPast(loc) ) { - break; // next element is past loc, stop looking but don't pop it - } else if ( nextLoc.overlapsP(loc) ) { - // add overlapping elements to our current features, removing from stream - for ( final GATKFeature feature : futureFeatures.next() ) { - currentFeatures.add(feature); - } - } - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedView.java deleted file mode 100644 index 23f4f73e8..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedView.java +++ /dev/null @@ -1,184 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import htsjdk.samtools.util.PeekableIterator; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.datasources.reads.ReadShard; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -/** - * a ROD view that allows for requests for RODs that overlap intervals on the genome to produce a RefMetaDataTracker - */ -public class IntervalReferenceOrderedView implements ReferenceOrderedView { - /** a list of the RMDDataState (location->iterators) */ - private final List states = new ArrayList<>(1); - - /** - * Used to get genome locs for reads - */ - protected final GenomeLocParser genomeLocParser; - - /** - * The total extent of all reads in this span. We create iterators from our RODs - * from the start of this span, to the end. - */ - private final GenomeLoc shardSpan; - - /** - * Create a new IntervalReferenceOrderedView taking data from provider and capable of - * servicing ROD overlap requests within the genomic interval span - * - * @param provider a ShardDataProvider to give us data - * @param span a GenomeLoc span, or null indicating take the entire genome - */ - public IntervalReferenceOrderedView(final ShardDataProvider provider, final GenomeLoc span) { - if ( provider == null ) throw new IllegalArgumentException("provider cannot be null"); - if ( provider.hasReferenceOrderedData() && span == null ) throw new IllegalArgumentException("span cannot be null when provider has reference ordered data"); - - this.genomeLocParser = provider.getGenomeLocParser(); - this.shardSpan = span; - provider.register(this); - - // conditional to optimize the case where we don't have any ROD data - if ( provider.hasReferenceOrderedData() && ! shardSpan.isUnmapped() ) { - for (final ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData()) - states.add(new RMDDataState(dataSource, dataSource.seek(shardSpan))); - } - } - - /** - * Testing constructor - */ - protected IntervalReferenceOrderedView(final GenomeLocParser genomeLocParser, - final GenomeLoc shardSpan, - final List names, - final List> featureSources) { - this.genomeLocParser = genomeLocParser; - this.shardSpan = shardSpan; - for ( int i = 0; i < names.size(); i++ ) - states.add(new RMDDataState(names.get(i), featureSources.get(i))); - } - - public Collection> getConflictingViews() { - List> classes = new ArrayList<>(); - classes.add(ManagingReferenceOrderedView.class); - return classes; - } - - /** - * Get a RefMetaDataTracker containing bindings for all RODs overlapping the start position of loc - * @param loc a GenomeLoc of size == 1 - * @return a non-null RefMetaDataTracker - */ - @Override - public RefMetaDataTracker getReferenceOrderedDataAtLocus(GenomeLoc loc) { - if ( loc == null ) throw new IllegalArgumentException("loc cannot be null"); - if ( loc.size() != 1 ) throw new IllegalArgumentException("GenomeLoc must have size == 1 but got " + loc); - return getReferenceOrderedDataForInterval(loc); - } - - /** - * Get a RefMetaDataTracker containing bindings for all RODs overlapping interval - * - * @param interval a non=null interval - * @return a non-null RefMetaDataTracker - */ - public RefMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) { - if ( interval == null ) throw new IllegalArgumentException("Interval cannot be null"); - - if ( states.isEmpty() || shardSpan.isUnmapped() ) // optimization for no bindings (common for read walkers) - return RefMetaDataTracker.EMPTY_TRACKER; - else { - final List bindings = new ArrayList<>(states.size()); - for ( final RMDDataState state : states ) - bindings.add(state.stream.getOverlapping(interval)); - return new RefMetaDataTracker(bindings); - } - } - - /** - * Trim down all of the ROD managers so that they only hold ROD bindings wit start >= startOfDataToKeep.getStart() - * - * @param startOfDataToKeep a non-null genome loc - */ - public void trimCurrentFeaturesToLoc(final GenomeLoc startOfDataToKeep) { - if ( startOfDataToKeep == null ) throw new IllegalArgumentException("startOfDataToKeep cannot be null"); - - for ( final RMDDataState state : states ) - state.stream.trimCurrentFeaturesToLoc(startOfDataToKeep); - } - - /** - * Closes the current view. - */ - public void close() { - for (final RMDDataState state : states) - state.close(); - - // Clear out the existing data so that post-close() accesses to this data will fail-fast. - states.clear(); - } - - /** - * Models the traversal state of a given ROD lane. - */ - private static class RMDDataState { - public final ReferenceOrderedDataSource dataSource; - public final IntervalOverlappingRODsFromStream stream; - private final LocationAwareSeekableRODIterator iterator; - - public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) { - this.dataSource = dataSource; - this.iterator = iterator; - this.stream = new IntervalOverlappingRODsFromStream(dataSource.getName(), new PeekableIterator<>(iterator)); - } - - /** - * For testing - */ - public RMDDataState(final String name, final PeekableIterator iterator) { - this.dataSource = null; - this.iterator = null; - this.stream = new IntervalOverlappingRODsFromStream(name, new PeekableIterator<>(iterator)); - } - - public void close() { - if ( dataSource != null ) - dataSource.close( iterator ); - } - } -} - diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceView.java deleted file mode 100644 index b53505097..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceView.java +++ /dev/null @@ -1,236 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import htsjdk.samtools.reference.ReferenceSequence; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.walkers.Reference; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.engine.walkers.Window; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Provides access to the portion of the reference covering a single locus. - */ -public class LocusReferenceView extends ReferenceView { - /** - * Bound the reference view to make sure all accesses are within the shard. - */ - private GenomeLoc bounds; - - /** - * Start of the expanded window for which the reference context should be provided, - * relative to the locus in question. - */ - private final int windowStart; - - - /** - * Start of the expanded window for which the reference context should be provided, - * relative to the locus in question. - */ - private final int windowStop; - - /** - * Track the reference sequence and the last point accessed. Used to - * track state when traversing over the reference. - */ - private ReferenceSequence referenceSequence; - - /** - * Create a LocusReferenceView given no other contextual information about - * the walkers, etc. - * @param provider source for locus data. - */ - public LocusReferenceView( LocusShardDataProvider provider ) { - super(provider); - initializeBounds(provider); - windowStart = windowStop = 0; - initializeReferenceSequence(bounds); - } - - /** - * Create a new locus reference view. - * @param provider source for locus data. - */ - public LocusReferenceView( Walker walker, LocusShardDataProvider provider ) { - super( provider ); - initializeBounds(provider); - - // Retrieve information about the window being accessed. - if( walker.getClass().isAnnotationPresent(Reference.class) ) { - Window window = walker.getClass().getAnnotation(Reference.class).window(); - - if( window.start() > 0 ) throw new ReviewedGATKException( "Reference window starts after current locus" ); - if( window.stop() < 0 ) throw new ReviewedGATKException( "Reference window ends before current locus" ); - - windowStart = window.start(); - windowStop = window.stop(); - } - else { - windowStart = 0; - windowStop = 0; - } - - if(bounds != null) { - int expandedStart = getWindowStart( bounds ); - int expandedStop = getWindowStop( bounds ); - initializeReferenceSequence(genomeLocParser.createGenomeLoc(bounds.getContig(), bounds.getContigIndex(), expandedStart, expandedStop)); - } - } - - /** - * Initialize the bounds of this shard, trimming the bounds so that they match the reference. - * @param provider Provider covering the appropriate locus. - */ - private void initializeBounds(LocusShardDataProvider provider) { - if(provider.getLocus() != null) { - int sequenceLength = reference.getSequenceDictionary().getSequence(provider.getLocus().getContig()).getSequenceLength(); - bounds = genomeLocParser.createGenomeLoc(provider.getLocus().getContig(), - Math.max(provider.getLocus().getStart(),1), - Math.min(provider.getLocus().getStop(),sequenceLength)); - } - else - bounds = null; - } - - /** - * Initialize reference sequence data using the given locus. - * @param locus - */ - private void initializeReferenceSequence( GenomeLoc locus ) { - this.referenceSequence = reference.getSubsequenceAt( locus.getContig(), locus.getStart(), locus.getStop() ); - } - - protected GenomeLoc trimToBounds(GenomeLoc l) { - int expandedStart = getWindowStart( bounds ); - int expandedStop = getWindowStop( bounds ); - if ( l.getStart() < expandedStart ) l = genomeLocParser.setStart(l, expandedStart); - if ( l.getStop() > expandedStop ) l = genomeLocParser.setStop(l, expandedStop); - return l; - } - - public class Provider implements ReferenceContext.ReferenceContextRefProvider { - int refStart, len; - - public Provider( int refStart, int len ) { - this.refStart = refStart; - this.len = len; - } - - public byte[] getBases() { - //System.out.printf("Getting bases for location%n"); - byte[] bases = new byte[len]; - System.arraycopy(referenceSequence.getBases(), refStart, bases, 0, len); - return bases; - } - } - - /** - * Gets the reference context associated with this particular point or extended interval on the genome. - * @param genomeLoc Region for which to retrieve the base(s). If region spans beyond contig end or beyond current bounds, it will be trimmed down. - * @return The base at the position represented by this genomeLoc. - */ - public ReferenceContext getReferenceContext( GenomeLoc genomeLoc ) { - //validateLocation( genomeLoc ); - - GenomeLoc window = genomeLocParser.createGenomeLoc( genomeLoc.getContig(), genomeLoc.getContigIndex(), - getWindowStart(genomeLoc), getWindowStop(genomeLoc) ); - - int refStart = -1; - if (bounds != null) { - window = trimToBounds(window); - refStart = (int)(window.getStart() - getWindowStart(bounds)); - } - else { - if(referenceSequence == null || referenceSequence.getContigIndex() != genomeLoc.getContigIndex()) - referenceSequence = reference.getSequence(genomeLoc.getContig()); - refStart = (int)window.getStart()-1; - } - - int len = (int)window.size(); - return new ReferenceContext( genomeLocParser, genomeLoc, window, new Provider(refStart, len)); - } - - /** - * Allow the user to pull reference info from any arbitrary region of the reference. - * @param genomeLoc The locus. - * @return A list of the bases starting at the start of the locus (inclusive) and ending - * at the end of the locus (inclusive). - */ - public byte[] getReferenceBases( GenomeLoc genomeLoc ) { - return super.getReferenceBases(genomeLoc); - } - - /** - * Gets the start of the expanded window, bounded if necessary by the contig. - * @param locus The locus to expand. - * @return The expanded window. - */ - private int getWindowStart( GenomeLoc locus ) { - // If the locus is not within the bounds of the contig it allegedly maps to, expand only as much as we can. - if(locus.getStart() < 1) return 1; -// if(locus.getStart() < 1) return locus.getStart(); - return Math.max( locus.getStart() + windowStart, 1 ); - } - - /** - * Gets the stop of the expanded window, bounded if necessary by the contig. - * @param locus The locus to expand. - * @return The expanded window. - */ - private int getWindowStop( GenomeLoc locus ) { - // If the locus is not within the bounds of the contig it allegedly maps to, expand only as much as we can. - int sequenceLength = reference.getSequenceDictionary().getSequence(locus.getContig()).getSequenceLength(); - if(locus.getStop() > sequenceLength) return sequenceLength; - return Math.min( locus.getStop() + windowStop, sequenceLength ); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusView.java deleted file mode 100644 index 9bc37e549..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusView.java +++ /dev/null @@ -1,220 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.ReadProperties; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.utils.locusiterator.LocusIterator; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; - -import java.util.Arrays; -import java.util.Collection; -import java.util.NoSuchElementException; - -/** - * User: hanna - * Date: May 13, 2009 - * Time: 3:30:16 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * The two goals of the LocusView are as follows: - * 1) To provide a 'trigger track' iteration interface so that TraverseLoci can easily switch - * between iterating over all bases in a region, only covered bases in a region covered by - * reads, only bases in a region covered by RODs, or any other sort of trigger track - * implementation one can think of. - * 2) To manage the copious number of iterators that have to be jointly pulled through the - * genome to make a locus traversal function. - */ -public abstract class LocusView extends LocusIterator implements View { - /** - * The locus bounding this view. - */ - protected GenomeLoc locus; - - /** - * The GenomeLocParser, used to create new genome locs. - */ - protected GenomeLocParser genomeLocParser; - - /** - * Source info for this view. Informs the class about downsampling requirements. - */ - private ReadProperties sourceInfo; - - /** - * The actual locus context iterator. - */ - private LocusIterator loci; - - /** - * The next locus context from the iterator. Lazy loaded: if nextLocus is null and advance() doesn't - * populate it, the iterator is exhausted. If populated, this is the value that should be returned by - * next(). - */ - private AlignmentContext nextLocus = null; - - public LocusView(LocusShardDataProvider provider) { - this.locus = provider.getLocus(); - - this.sourceInfo = provider.getSourceInfo(); - this.genomeLocParser = provider.getGenomeLocParser(); - this.loci = provider.getLocusIterator(); - - advance(); - - provider.register(this); - } - - /** - * Only one view of the locus is supported at any given time. - * @return A list consisting of all other locus views. - */ - public Collection> getConflictingViews() { - return Arrays.>asList(LocusView.class,ReadView.class); - } - - /** - * Close this view. - */ - public void close() { - // Set everything to null with the hope of failing fast. - locus = null; - sourceInfo = null; - loci = null; - - super.close(); - } - - /** - * Is there another covered locus context bounded by this view. - * @return True if another covered locus context exists. False otherwise. - */ - public abstract boolean hasNext(); - - /** - * Returns the next covered locus context in the shard. - * @return Next covered locus context in the shard. - * @throw NoSuchElementException if no such element exists. - */ - public abstract AlignmentContext next(); - - /** - * Unsupported. - * @throw UnsupportedOperationException always. - */ - public void remove() { - throw new UnsupportedOperationException("Unable to remove elements from this queue."); - } - - /** - * Is there another locus context bounded by this shard. - * @return True if another locus context is bounded by this shard. - */ - protected boolean hasNextLocus() { - advance(); - return nextLocus != null; - } - - /** - * Get the next locus context bounded by this shard. - * @return Next locus context bounded by this shard. - * @throw NoSuchElementException if the next element is missing. - */ - protected AlignmentContext nextLocus() { - advance(); - if(nextLocus == null) - throw new NoSuchElementException("No more elements remain in locus context queue."); - - // Cache the current and apply filtering. - AlignmentContext current = nextLocus; - - // Indicate that the next operation will need to advance. - nextLocus = null; - - return current; - } - - /** - * Seed the nextLocus variable with the contents of the next locus (if one exists). - */ - private void advance() { - // Already an unclaimed locus present - if(nextLocus != null) - return; - - //System.out.printf("loci is %s%n", loci); - if( !loci.hasNext() ) { - nextLocus = null; - return; - } - - nextLocus = loci.next(); - - // If the location of this shard is available, trim the data stream to match the shard. - // TODO: Much of this functionality is being replaced by the WindowMaker. - if(locus != null) { - // Iterate through any elements not contained within this shard. - while( nextLocus != null && !isContainedInShard(nextLocus.getLocation()) && loci.hasNext() ) - nextLocus = loci.next(); - - // If nothing in the shard was found, indicate that by setting nextLocus to null. - if( nextLocus != null && !isContainedInShard(nextLocus.getLocation()) ) - nextLocus = null; - } - } - - /** - * Is this location contained in the given shard. - * @param location Location to check. - * @return True if the given location is contained within the shard. False otherwise. - */ - private boolean isContainedInShard(GenomeLoc location) { - return locus.containsP(location); - } - - /** - * {@inheritDoc} - * - * Since this class has an actual LIBS, so this function will never throw an exception - * - * @return the LocusIteratorByState used by this view to get pileups - */ - @Override - public LocusIteratorByState getLIBS() { - return loci.getLIBS(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ManagingReferenceOrderedView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ManagingReferenceOrderedView.java deleted file mode 100644 index 2dd42c1cc..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ManagingReferenceOrderedView.java +++ /dev/null @@ -1,117 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -/** - * User: hanna - * Date: May 21, 2009 - * Time: 2:49:17 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * A view into the reference-ordered data in the provider. - */ -public class ManagingReferenceOrderedView implements ReferenceOrderedView { - /** - * The data sources along with their current states. - */ - private List states = new ArrayList(); - - /** - * Create a new view of reference-ordered data. - * @param provider - */ - public ManagingReferenceOrderedView( LocusShardDataProvider provider ) { - for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) - states.add(new ReferenceOrderedDataState(dataSource, dataSource.seek(provider.getLocus()))); - - provider.register(this); - } - - public Collection> getConflictingViews() { return Collections.emptyList(); } - - /** - * Gets an object which can track the reference-ordered data at every locus. - * @param loc Locus at which to track. - * @return A tracker containing information about this locus. - */ - @Override - public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) { - if ( states.isEmpty() ) - return RefMetaDataTracker.EMPTY_TRACKER; - else { - List bindings = new ArrayList(states.size()); - - for ( ReferenceOrderedDataState state: states ) - // todo -- warning, I removed the reference to the name from states - bindings.add( state.iterator.seekForward(loc) ); - - return new RefMetaDataTracker(bindings); - } - } - - /** - * Closes the current view. - */ - public void close() { - for( ReferenceOrderedDataState state: states ) - state.dataSource.close( state.iterator ); - - // Clear out the existing data so that post-close() accesses to this data will fail-fast. - states = null; - } -} - -/** - * Models the traversal state of a given ROD lane. - */ -class ReferenceOrderedDataState { - public final ReferenceOrderedDataSource dataSource; - public final LocationAwareSeekableRODIterator iterator; - - public ReferenceOrderedDataState( ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator ) { - this.dataSource = dataSource; - this.iterator = iterator; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RODMetaDataContainer.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RODMetaDataContainer.java deleted file mode 100644 index f244e504d..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RODMetaDataContainer.java +++ /dev/null @@ -1,83 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; -import org.broadinstitute.gatk.utils.collections.Pair; - -import java.util.*; - - -/** - * - * @author aaron - * - * Class RODMetaDataContainer - * - * stores both the name and the class for each ROD. This class assumes that: - * - * -Names must be unique - * -Classes are allowed to have duplicates - * - * This class encapsulates the ref data associations, and provides lookup by name and by - * class type. - * - */ -public class RODMetaDataContainer { - // we only allow non-duplicate ROD names, a HashMap is fine - private final HashMap nameMap = new HashMap(); - - // we do allow duplicate class entries, so we need to store pairs of data - private final List> classMap = new ArrayList>(); - - public void addEntry(GATKFeature data) { - nameMap.put(data.getName(),data); - classMap.add(new Pair(data.getClass(),data)); - } - - public Collection getSet(String name) { - if (name == null) return getSet(); - Set set = new HashSet(); - if (nameMap.containsKey(name)) set.add(nameMap.get(name)); - return set; - } - - /** - * get the feature contents of this container; the unfiltered set without their name association - * @return - */ - public Collection getSet() { - return new ArrayList(nameMap.values()); - } - - // the brute force (n) search ended up being faster than sorting and binary search in all but the most extreme cases (thousands of RODs at a location). - public Collection getSet(Class cls) { - Collection ret = new ArrayList(); - for (Pair pair: classMap) - if (pair.first.equals(cls)) ret.add(pair.second); - return ret; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadBasedReferenceOrderedView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadBasedReferenceOrderedView.java deleted file mode 100644 index 1d73501bd..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadBasedReferenceOrderedView.java +++ /dev/null @@ -1,69 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import htsjdk.samtools.util.PeekableIterator; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.datasources.reads.ReadShard; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -/** a ROD view for reads. This provides the Read traversals a way of getting a RefMetaDataTracker */ -public class ReadBasedReferenceOrderedView extends IntervalReferenceOrderedView { - public ReadBasedReferenceOrderedView(final ShardDataProvider provider) { - super(provider, provider.hasReferenceOrderedData() ? ((ReadShard)provider.getShard()).getReadsSpan() : null); - } - - /** - * create a RefMetaDataTracker given the current read - * - * @param rec the read - * - * @return a RefMetaDataTracker for the read, from which you can get ROD -> read alignments - */ - @Requires("rec != null") - @Ensures("result != null") - public RefMetaDataTracker getReferenceOrderedDataForRead(final SAMRecord rec) { - if ( rec.getReadUnmappedFlag() ) - return RefMetaDataTracker.EMPTY_TRACKER; - else { - final GenomeLoc readSpan = genomeLocParser.createGenomeLoc(rec); - trimCurrentFeaturesToLoc(readSpan); - return getReferenceOrderedDataForInterval(readSpan); - } - } -} - diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceView.java deleted file mode 100644 index 14d5827a3..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceView.java +++ /dev/null @@ -1,102 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.GenomeLoc; -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * User: hanna - * Date: May 22, 2009 - * Time: 12:36:14 PM - * - */ - -/** Provides access to the reference over a single read. */ - -public class ReadReferenceView extends ReferenceView { - /** - * Create a view of the reference with respect to a single read. - * - * @param provider - */ - public ReadReferenceView( ShardDataProvider provider ) { - super(provider); - } - - protected ReferenceContext.ReferenceContextRefProvider getReferenceBasesProvider( GenomeLoc genomeLoc ) { - return new Provider(genomeLoc); - } - - public class Provider implements ReferenceContext.ReferenceContextRefProvider { - GenomeLoc loc; - - public Provider( GenomeLoc loc ) { - this.loc = loc; - } - - public byte[] getBases() { - return getReferenceBases(loc); - } - } - - /** - * Return a reference context appropriate for the span of read - * - * @param read the mapped read to test - * @return - */ - public ReferenceContext getReferenceContext( final SAMRecord read ) { - GenomeLoc loc = genomeLocParser.createGenomeLoc(read); - return new ReferenceContext( genomeLocParser, loc, loc, getReferenceBasesProvider(loc) ); - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java deleted file mode 100644 index 8acfad0b1..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java +++ /dev/null @@ -1,82 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import org.broadinstitute.gatk.engine.datasources.reads.Shard; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import org.broadinstitute.gatk.utils.GenomeLocParser; - -import java.util.Collection; - -/** - * Present data sharded by read to a traversal engine. - * - * @author mhanna - * @version 0.1 - */ -public class ReadShardDataProvider extends ShardDataProvider { - /** - * The raw collection of reads. - */ - private final GATKSAMIterator reads; - - /** - * Create a data provider for the shard given the reads and reference. - * @param shard The chunk of data over which traversals happen. - * @param reference A getter for a section of the reference. - */ - public ReadShardDataProvider(Shard shard, GenomeLocParser genomeLocParser, GATKSAMIterator reads, IndexedFastaSequenceFile reference, Collection rods) { - super(shard,genomeLocParser,reference,rods); - this.reads = reads; - } - - /** - * Can this data source provide reads? - * @return True if reads are available, false otherwise. - */ - public boolean hasReads() { - return reads != null; - } - - /** - * Gets an iterator over all the reads bound by this shard. - * @return An iterator over all reads in this shard. - */ - public GATKSAMIterator getReadIterator() { - return reads; - } - - @Override - public void close() { - super.close(); - - if(reads != null) - reads.close(); - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadView.java deleted file mode 100644 index 160dbd585..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadView.java +++ /dev/null @@ -1,88 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; - -import java.util.Arrays; -import java.util.Collection; -/** - * User: hanna - * Date: May 22, 2009 - * Time: 12:06:54 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * A view into the reads that a provider can provide. - */ -public class ReadView implements View, Iterable { - /** - * The iterator into the reads supplied by this provider. - */ - private GATKSAMIterator reads; - - /** - * Create a new view of the reads given the current data set. - * @param provider Source for the data. - */ - public ReadView( ReadShardDataProvider provider ) { - reads = provider.getReadIterator(); - } - - /** - * Other reads and loci conflict with this view. - * @return Array of reads and loci. - */ - public Collection> getConflictingViews() { - return Arrays.>asList(ReadView.class, LocusView.class); - } - - /** - * Close the view over these reads. Note that this method closes just - * the view into the reads, not the reads themselves. - */ - public void close() { - // Don't close the reads. The provider is responsible for this. - // Just dispose of the pointer. - reads = null; - } - - /** - * Gets an iterator into the reads supplied by this provider. - * @return Iterator into the reads that this provider covers. - */ - public GATKSAMIterator iterator() { - return reads; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedView.java deleted file mode 100644 index 9f3db5143..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedView.java +++ /dev/null @@ -1,33 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.utils.GenomeLoc; - -public interface ReferenceOrderedView extends View { - RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RodLocusView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RodLocusView.java deleted file mode 100644 index 21cb3efa6..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RodLocusView.java +++ /dev/null @@ -1,197 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.collections.RODMergingIterator; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; - -import java.util.*; - -/** - * A view into the reference-ordered data in the provider. - */ -public class RodLocusView extends LocusView implements ReferenceOrderedView { - /** - * The data sources along with their current states. - */ - private RODMergingIterator rodQueue = null; - - Collection allTracksHere; - - GenomeLoc lastLoc = null; - RODRecordList interval = null; - - /** - * The data sources along with their current states. - */ - private List states = new ArrayList(); - - /** - * Enable debugging output -- todo remove me - */ - final static boolean DEBUG = false; - - final static String INTERVAL_ROD_NAME = "interval"; - - /** - * Create a new view of reference-ordered data. - * - * @param provider - */ - public RodLocusView( LocusShardDataProvider provider ) { - super(provider); - - GenomeLoc loc = provider.getLocus(); - - List< Iterator > iterators = new LinkedList< Iterator >(); - for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) { - if ( DEBUG ) System.out.printf("Shard is %s%n", provider.getLocus()); - - // grab the ROD iterator from the data source, and compute the first location in this shard, forwarding - // the iterator to immediately before it, so that it can be added to the merging iterator primed for - // next() to return the first real ROD in this shard - LocationAwareSeekableRODIterator it = dataSource.seek(provider.getLocus()); - it.seekForward(genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart()-1)); - - states.add(new ReferenceOrderedDataState(dataSource,it)); - - // we need to special case the interval so we don't always think there's a rod at the first location - if ( dataSource.getName().equals(INTERVAL_ROD_NAME) ) { - if ( interval != null ) - throw new RuntimeException("BUG: interval local variable already assigned " + interval); - interval = it.next(); - } else { - iterators.add( it ); - } - } - - rodQueue = new RODMergingIterator(iterators); - } - - @Override - public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) { - // special case the interval again -- add it into the ROD - if ( interval != null ) { allTracksHere.add(interval); } - return new RefMetaDataTracker(allTracksHere); - } - - public boolean hasNext() { - if ( ! rodQueue.hasNext() ) - return false; - else { - return ! rodQueue.peekLocation().isPast(locus); - } - } - - /** - * Returns the next covered locus context in the shard. - * @return Next covered locus context in the shard. - * @throw NoSuchElementException if no such element exists. - */ - public AlignmentContext next() { - if ( DEBUG ) System.out.printf("In RodLocusView.next()...%n"); - RODRecordList datum = rodQueue.next(); - if ( DEBUG ) System.out.printf("In RodLocusView.next(); datum = %s...%n", datum.getLocation()); - - if ( DEBUG ) System.out.printf("In RodLocusView.next(): creating tracker...%n"); - - allTracksHere = getSpanningTracks(datum); - GenomeLoc rodSite = datum.getLocation(); - GenomeLoc site = genomeLocParser.createGenomeLoc( rodSite.getContig(), rodSite.getStart(), rodSite.getStart()); - - if ( DEBUG ) System.out.printf("rodLocusView.next() is at %s%n", site); - - // calculate the number of skipped bases, and update lastLoc so we can do that again in the next() - long skippedBases = getSkippedBases( rodSite ); - lastLoc = site; - return new AlignmentContext(site, new ReadBackedPileupImpl(site), skippedBases); - } - - private Collection getSpanningTracks(RODRecordList marker) { - return rodQueue.allElementsLTE(marker); - } - - /** - * Returns the number of reference bases that have been skipped: - * - * 1 -- since the last processed location if we have one - * 2 -- from the beginning of the shard if this is the first loc - * 3 -- from the last location to the current position - * - * @param currentPos - * @return - */ - private long getSkippedBases( GenomeLoc currentPos ) { - // the minus - is because if lastLoc == null, you haven't yet seen anything in this interval, so it should also be counted as skipped - Integer compStop = lastLoc == null ? locus.getStart() - 1 : lastLoc.getStop(); - long skippedBases = currentPos.getStart() - compStop - 1; - - if ( skippedBases < -1 ) { // minus 1 value is ok - throw new RuntimeException(String.format("BUG: skipped bases=%d is < 0: cur=%s vs. last=%s, shard=%s", - skippedBases, currentPos, lastLoc, locus)); - } - return Math.max(skippedBases, 0); - } - - /** - * Get the location one after the last position we will traverse through - * @return - */ - public GenomeLoc getLocOneBeyondShard() { - return genomeLocParser.createGenomeLoc(locus.getContig(),locus.getStop()+1); - } - - /** - * How many bases are we skipping from the current location to the end of the interval / shard - * if we have no more elements - * - * @return - */ - public long getLastSkippedBases() { - if ( hasNext() ) - throw new RuntimeException("BUG: getLastSkippedBases called when there are elements remaining."); - - return getSkippedBases(getLocOneBeyondShard()); - } - - /** - * Closes the current view. - */ - public void close() { - for( ReferenceOrderedDataState state: states ) - state.dataSource.close( state.iterator ); - - rodQueue = null; - allTracksHere = null; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMAccessPlan.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMAccessPlan.java deleted file mode 100644 index 1e30d6c38..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMAccessPlan.java +++ /dev/null @@ -1,170 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.util.PeekableIterator; -import htsjdk.samtools.GATKBAMFileSpan; -import htsjdk.samtools.GATKChunk; -import htsjdk.samtools.util.BlockCompressedFilePointerUtil; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.LinkedList; -import java.util.List; - -/** -* Created by IntelliJ IDEA. -* User: mhanna -* Date: 10/14/11 -* Time: 10:47 PM -* To change this template use File | Settings | File Templates. -*/ -class BAMAccessPlan { - private final SAMReaderID reader; - private final BlockInputStream inputStream; - - private final List positions; - private PeekableIterator positionIterator; - - /** - * Stores the next block address to read, or -1 if no such block is available. - */ - private long nextBlockAddress; - - - BAMAccessPlan(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) { - this.reader = reader; - this.inputStream = inputStream; - - this.positions = fileSpan.getGATKChunks(); - initialize(); - } - - public SAMReaderID getReader() { - return reader; - } - - public BlockInputStream getInputStream() { - return inputStream; - } - - /** - * Retrieves the next block address to be read. - * @return Next block address to be read. - */ - public long getBlockAddress() { - return nextBlockAddress; - } - - /** - * Retrieves the first offset of interest in the block returned by getBlockAddress(). - * @return First block of interest in this segment. - */ - public int getFirstOffsetInBlock() { - return (nextBlockAddress == positionIterator.peek().getBlockStart()) ? positionIterator.peek().getBlockOffsetStart() : 0; - } - - /** - * Gets the spans overlapping the given block; used to copy the contents of the block into the circular buffer. - * @param blockAddress Block address for which to search. - * @param filePosition Block address at which to terminate the last chunk if the last chunk goes beyond this span. - * @return list of chunks containing that block. - */ - public List getSpansOverlappingBlock(long blockAddress, long filePosition) { - List spansOverlapping = new LinkedList(); - // While the position iterator overlaps the given block, pull out spans to report. - while(positionIterator.hasNext() && positionIterator.peek().getBlockStart() <= blockAddress) { - // Create a span over as much of the block as is covered by this chunk. - int blockOffsetStart = (blockAddress == positionIterator.peek().getBlockStart()) ? positionIterator.peek().getBlockOffsetStart() : 0; - - // Calculate the end of this span. If the span extends past this block, cap it using the current file position. - long blockEnd; - int blockOffsetEnd; - if(blockAddress < positionIterator.peek().getBlockEnd()) { - blockEnd = filePosition; - blockOffsetEnd = 0; - } - else { - blockEnd = positionIterator.peek().getBlockEnd(); - blockOffsetEnd = positionIterator.peek().getBlockOffsetEnd(); - } - - GATKChunk newChunk = new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd); - - if(newChunk.getChunkStart() <= newChunk.getChunkEnd()) - spansOverlapping.add(new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd)); - - // If the value currently stored in the position iterator ends past the current block, we must be done. Abort. - if(!positionIterator.hasNext() || positionIterator.peek().getBlockEnd() > blockAddress) - break; - - // If the position iterator ends before the block ends, pull the position iterator forward. - if(positionIterator.peek().getBlockEnd() <= blockAddress) - positionIterator.next(); - } - - return spansOverlapping; - } - - public void reset() { - initialize(); - } - - /** - * Resets the SAM reader position to its original state. - */ - private void initialize() { - this.positionIterator = new PeekableIterator(positions.iterator()); - if(positionIterator.hasNext()) - nextBlockAddress = positionIterator.peek().getBlockStart(); - else - nextBlockAddress = -1; - } - - /** - * Advances the current position to the next block to read, given the current position in the file. - * @param filePosition The current position within the file. - */ - void advancePosition(final long filePosition) { - nextBlockAddress = BlockCompressedFilePointerUtil.getBlockAddress(filePosition); - - // Check the current file position against the iterator; if the iterator is before the current file position, - // draw the iterator forward. Remember when performing the check that coordinates are half-open! - while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) - positionIterator.next(); - - // If the block iterator has shot past the file pointer, bring the file pointer flush with the start of the current block. - if(positionIterator.hasNext() && filePosition < positionIterator.peek().getChunkStart()) - nextBlockAddress = positionIterator.peek().getBlockStart(); - - // If we've shot off the end of the block pointer, notify consumers that iteration is complete. - if(!positionIterator.hasNext()) - nextBlockAddress = -1; - } - - private boolean isFilePositionPastEndOfChunk(final long filePosition, final GATKChunk chunk) { - return filePosition >= chunk.getChunkEnd(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMSchedule.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMSchedule.java deleted file mode 100644 index a80b0a475..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMSchedule.java +++ /dev/null @@ -1,530 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.util.PeekableIterator; -import htsjdk.samtools.Bin; -import htsjdk.samtools.GATKBAMFileSpan; -import htsjdk.samtools.GATKChunk; -import htsjdk.samtools.util.CloseableIterator; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.GATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.File; -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.channels.FileChannel; -import java.util.*; - -/** - * Writes schedules for a single BAM file to a target output file. - */ -public class BAMSchedule implements CloseableIterator { - /** - * File in which to store schedule data. - */ - private File scheduleFile; - - /** - * File channel for the schedule file. - */ - private FileChannel scheduleFileChannel; - - /** - * The definitive, sorted list of reader IDs. Order is important here: the order - * in which the reader IDs are presented here maps to the order in which they appear in the file. - */ - private final List readerIDs = new ArrayList(); - - /** - * Iterators over the schedule. Stored in the same order as readerIDs, above. - */ - private final List> scheduleIterators = new ArrayList>(); - - /** - * Next schedule entry to be returned. Null if no additional entries are present. - */ - private BAMScheduleEntry nextScheduleEntry; - - /** - * Reference sequence for which to write the schedule. - */ - private final int referenceSequence; - - /** - * Sizes of ints and longs in bytes. - */ - private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8; - private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8; - - /** - * Create a new BAM schedule based on the given index. - * @param dataSource The SAM data source to use. - * @param intervals List of - */ - public BAMSchedule(final SAMDataSource dataSource, final List intervals) { - if(intervals.isEmpty()) - throw new ReviewedGATKException("Tried to write schedule for empty interval list."); - - referenceSequence = dataSource.getHeader().getSequence(intervals.get(0).getContig()).getSequenceIndex(); - - createScheduleFile(); - - readerIDs.addAll(dataSource.getReaderIDs()); - - for(final SAMReaderID reader: readerIDs) { - final GATKBAMIndex index = dataSource.getIndex(reader); - final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence); - - int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1); - Iterator locusIterator = intervals.iterator(); - GenomeLoc currentLocus = locusIterator.next(); - - final long readerStartOffset = position(); - - int maxChunkCount = 0; - - while(currentBinInLowestLevel < GATKBAMIndex.MAX_BINS && currentLocus != null) { - final Bin bin = new Bin(referenceSequence,currentBinInLowestLevel); - final int binStart = index.getFirstLocusInBin(bin); - final int binStop = index.getLastLocusInBin(bin); - - // In required, pull bin iterator ahead to the point of the next GenomeLoc. - if(binStop < currentLocus.getStart()) { - currentBinInLowestLevel++; - continue; - } - - // At this point, the bin stop is guaranteed to be >= the start of the locus. - // If the bins have gone past the current locus, update the current locus if at all possible. - if(binStart > currentLocus.getStop()) { - currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; - continue; - } - - // Code at this point knows that the current bin is neither before nor after the current locus, - // so it must overlap. Add this region to the filesystem. - final GATKBAMFileSpan fileSpan = indexData.getSpanOverlapping(bin); - - if(!fileSpan.isEmpty()) { - // File format is binary in little endian; start of region, end of region, num chunks, then the chunks themselves. - ByteBuffer buffer = allocateByteBuffer(2*INT_SIZE_IN_BYTES + INT_SIZE_IN_BYTES + fileSpan.getGATKChunks().size()*LONG_SIZE_IN_BYTES*2); - buffer.putInt(binStart); - buffer.putInt(binStop); - buffer.putInt(fileSpan.getGATKChunks().size()); - for(GATKChunk chunk: fileSpan.getGATKChunks()) { - buffer.putLong(chunk.getChunkStart()); - buffer.putLong(chunk.getChunkEnd()); - } - maxChunkCount = Math.max(maxChunkCount,fileSpan.getGATKChunks().size()); - - // Prepare buffer for writing - buffer.flip(); - - // And write. - write(buffer); - } - - currentBinInLowestLevel++; - } - - final long readerStopOffset = position(); - - scheduleIterators.add(new PeekableIterator(new BAMScheduleIterator(reader,readerStartOffset,readerStopOffset,maxChunkCount))); - - // Iterator initialization might move the file pointer. Make sure it gets reset back to where it was before iterator initialization. - position(readerStopOffset); - } - - advance(); - } - - /** - * Determine whether more ScheduleEntries are present in the iterator. - * @return Next schedule entry to parse. - */ - @Override - public boolean hasNext() { - return nextScheduleEntry != null; - } - - /** - * Retrieve the next schedule entry in the list. - * @return next schedule entry in the queue. - */ - @Override - public BAMScheduleEntry next() { - BAMScheduleEntry currentScheduleEntry = nextScheduleEntry; - advance(); - return currentScheduleEntry; - } - - /** - * Close down and delete the file. - */ - @Override - public void close() { - try { - scheduleFileChannel.close(); - } - catch(IOException ex) { - throw makeIOFailureException(true, "Unable to close schedule file.", ex); - } - } - - /** - * Convenience routine for creating UserExceptions - * @param wasWriting - * @param message - * @param e - * @return - */ - private final GATKException makeIOFailureException(final boolean wasWriting, final String message, final Exception e) { - if ( wasWriting ) { - if ( e == null ) - return new UserException.CouldNotCreateOutputFile(scheduleFile, message); - else - return new UserException.CouldNotCreateOutputFile(scheduleFile, message, e); - } else { - if ( e == null ) - return new UserException.CouldNotReadInputFile(scheduleFile, message); - else - return new UserException.CouldNotReadInputFile(scheduleFile, message, e); - } - } - - /** - * Advance to the next schedule entry. - */ - private void advance() { - nextScheduleEntry = null; - - BitSet selectedIterators = new BitSet(readerIDs.size()); - int currentStart = Integer.MAX_VALUE; - int currentStop = Integer.MAX_VALUE; - - // Select every iterator whose next element is the lowest element in the list. - for(int reader = 0; reader < scheduleIterators.size(); reader++) { - PeekableIterator scheduleIterator = scheduleIterators.get(reader); - if(!scheduleIterator.hasNext()) - continue; - - // If the iterator starts after this one, skip over it. - if(scheduleIterator.peek().start > currentStart) - continue; - - // If the iterator starts at the same point as this one, add it to the list. - if(scheduleIterator.peek().start == currentStart) { - selectedIterators.set(reader); - currentStop = Math.min(scheduleIterator.peek().stop,currentStop); - continue; - } - - // If the iterator is less than anything seen before it, purge the selections and make this one current. - if(scheduleIterator.peek().start < currentStart) { - selectedIterators.clear(); - selectedIterators.set(reader); - currentStart = scheduleIterator.peek().start; - currentStop = scheduleIterator.peek().stop; - } - } - - // Out of iterators? Abort early. - if(selectedIterators.isEmpty()) - return; - - // Create the target schedule entry - BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop); - - // For each schedule entry with data, load the data into the merged schedule. - for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) { - PeekableIterator scheduleIterator = scheduleIterators.get(reader); - BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek(); - mergedScheduleEntry.mergeInto(individualScheduleEntry); - - // If the schedule iterator ends after this entry, consume it. - if(individualScheduleEntry.stop <= currentStop) - scheduleIterator.next(); - } - - // For each schedule entry without data, add a blank entry. - for (int reader = selectedIterators.nextClearBit(0); reader < readerIDs.size(); reader = selectedIterators.nextClearBit(reader+1)) { - mergedScheduleEntry.addFileSpan(readerIDs.get(reader),new GATKBAMFileSpan()); - } - - nextScheduleEntry = mergedScheduleEntry; - } - - @Override - public void remove() { throw new UnsupportedOperationException("Unable to remove from a schedule iterator."); } - - /** - * Create a new schedule file, containing schedule information for all BAM files being dynamically merged. - */ - private void createScheduleFile() { - try { - scheduleFile = File.createTempFile("bamschedule."+referenceSequence,null); - scheduleFileChannel = new RandomAccessFile(scheduleFile,"rw").getChannel(); - } - catch(IOException ex) { - throw new UserException("Unable to create a temporary BAM schedule file. Please make sure Java can write to the default temp directory or use -Djava.io.tmpdir= to instruct it to use a different temp directory instead.",ex); - } - scheduleFile.deleteOnExit(); - - } - - /** - * Creates a new byte buffer of the given size. - * @param size the size of buffer to allocate. - * @return Newly allocated byte buffer. - */ - private ByteBuffer allocateByteBuffer(final int size) { - ByteBuffer buffer = ByteBuffer.allocate(size); - buffer.order(ByteOrder.LITTLE_ENDIAN); - return buffer; - } - - /** - * Reads the contents at the current position on disk into the given buffer. - * @param buffer buffer to fill. - */ - private int read(final ByteBuffer buffer) { - try { - return scheduleFileChannel.read(buffer); - } - catch(IOException ex) { - throw makeIOFailureException(false, "Unable to read data from BAM schedule file.", ex); - } - } - - private void write(final ByteBuffer buffer) { - try { - scheduleFileChannel.write(buffer); - if(buffer.remaining() > 0) - throw makeIOFailureException(true, "Unable to write entire buffer to file.", null); - } - catch(IOException ex) { - throw makeIOFailureException(true, "Unable to write data to BAM schedule file.", ex); - } - } - - /** - * Reads the current position from the file channel. - * @return Current position within file channel. - */ - private long position() { - try { - return scheduleFileChannel.position(); - } - catch(IOException ex) { - throw makeIOFailureException(false, "Unable to retrieve position of BAM schedule file.", ex); - } - } - - /** - * Reposition the file channel to the specified offset wrt the start of the file. - * @param position The position. - */ - private void position(final long position) { - try { - scheduleFileChannel.position(position); - } - catch(IOException ex) { - throw makeIOFailureException(false, "Unable to position BAM schedule file.",ex); - } - } - - /** - * An iterator over the schedule for a single BAM file. - */ - private class BAMScheduleIterator implements Iterator { - /** - * ID of the reader associated with the given schedule. - */ - private final SAMReaderID reader; - - /** - * Current position in the file. - */ - private long currentPosition; - - /** - * Stopping file position of last bin in file for this reader, exclusive. - */ - private final long stopPosition; - - /** - * Byte buffer used to store BAM header info. - */ - private final ByteBuffer binHeader; - - /** - * Byte buffer used to store chunk data. - */ - private final ByteBuffer chunkData; - - public BAMScheduleIterator(final SAMReaderID reader, final long startPosition, final long stopPosition, final int maxChunkCount) { - this.reader = reader; - this.currentPosition = startPosition; - this.stopPosition = stopPosition; - binHeader = allocateByteBuffer(INT_SIZE_IN_BYTES*3); - chunkData = allocateByteBuffer(maxChunkCount*LONG_SIZE_IN_BYTES*2); - } - - @Override - public boolean hasNext() { - return currentPosition < stopPosition; - } - - @Override - public BAMScheduleEntry next() { - position(currentPosition); - - // Read data. - int binHeaderBytesRead = read(binHeader); - - // Make sure we read in a complete bin header: - if ( binHeaderBytesRead < INT_SIZE_IN_BYTES * 3 ) { - throw new ReviewedGATKException(String.format("Unable to read a complete bin header from BAM schedule file %s for BAM file %s. " + - "The BAM schedule file is likely incomplete/corrupt.", - scheduleFile.getAbsolutePath(), reader.getSamFilePath())); - } - - // Decode contents. - binHeader.flip(); - final int start = binHeader.getInt(); - final int stop = binHeader.getInt(); - final int numChunks = binHeader.getInt(); - - // Prepare bin buffer for next read. - binHeader.flip(); - - // Prepare a target buffer for chunks. - GATKChunk[] chunks = new GATKChunk[numChunks]; - - // Read all chunk data. - chunkData.limit(numChunks*LONG_SIZE_IN_BYTES*2); - long bytesRead = read(chunkData); - if(bytesRead != numChunks*LONG_SIZE_IN_BYTES*2) - throw new ReviewedGATKException("Unable to read all chunks from file"); - - // Prepare for reading. - chunkData.flip(); - - for(int i = 0; i < numChunks; i++) - chunks[i] = new GATKChunk(chunkData.getLong(),chunkData.getLong()); - - // Prepare chunk buffer for next read. - chunkData.flip(); - - BAMScheduleEntry nextScheduleEntry = new BAMScheduleEntry(start,stop); - nextScheduleEntry.addFileSpan(reader,new GATKBAMFileSpan(chunks)); - - // Reset the position of the iterator at the next contig. - currentPosition = position(); - - return nextScheduleEntry; - } - - /** - * Not supported. - */ - @Override - public void remove() { - throw new UnsupportedOperationException("Unable to remove from a BAMScheduleIterator"); - } - - } -} - -/** - * A single proto-shard to be processed. - */ -class BAMScheduleEntry { - /** - * Starting position for the genomic entry. - */ - public final int start; - - /** - * Ending position for the genomic entry. - */ - public final int stop; - - /** - * The spans representing the given region. - */ - public final Map fileSpans = new HashMap(); - - BAMScheduleEntry(final int start, final int stop) { - this.start = start; - this.stop = stop; - } - - /** - * Add a new file span to this schedule. - * @param reader Reader associated with the span. - * @param fileSpan Blocks to read in the given reader. - */ - public void addFileSpan(final SAMReaderID reader, final GATKBAMFileSpan fileSpan) { - fileSpans.put(reader,fileSpan); - } - - /** - * A naive merge operation. Merge the fileSpans in other into this, blowing up if conflicts are - * detected. Completely ignores merging start and stop. - * @param other Other schedule entry to merging into this one. - */ - public void mergeInto(final BAMScheduleEntry other) { - final int thisSize = fileSpans.size(); - final int otherSize = other.fileSpans.size(); - fileSpans.putAll(other.fileSpans); - if(fileSpans.size() != thisSize+otherSize) - throw new ReviewedGATKException("Unable to handle overlaps when merging BAM schedule entries."); - } - - /** - * Returns true if the location of this bin tree is before the given position. - * @param locus Locus to test. - * @return True if this bin sits completely before the given locus; false otherwise. - */ - public boolean isBefore(final GenomeLoc locus) { - return stop < locus.getStart(); - } - - /** - * Checks overlap between this bin tree and other bin trees. - * @param position the position over which to detect overlap. - * @return True if the segment overlaps. False otherwise. - */ - public boolean overlaps(final GenomeLoc position) { - return !(position.getStop() < start || position.getStart() > stop); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMScheduler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMScheduler.java deleted file mode 100644 index 1ea8d39aa..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMScheduler.java +++ /dev/null @@ -1,320 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.util.PeekableIterator; -import htsjdk.samtools.GATKBAMFileSpan; -import htsjdk.samtools.GATKChunk; -import htsjdk.samtools.SAMSequenceRecord; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.GenomeLocSortedSet; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; -import org.broadinstitute.gatk.utils.sam.ReadUtils; - -import java.util.*; - -/** - * Assign intervals to the most appropriate blocks, keeping as little as possible in memory at once. - */ -public class BAMScheduler implements Iterator { - private final SAMDataSource dataSource; - - private final Map indexFiles = new HashMap(); - - private FilePointer nextFilePointer = null; - - private GenomeLocSortedSet loci; - private PeekableIterator locusIterator; - private GenomeLoc currentLocus; - private IntervalMergingRule intervalMergingRule; - - /* - * Creates BAMScheduler using contigs from the given BAM data source. - * - * @param dataSource BAM source - * @return non-null BAM scheduler - */ - public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource) { - final BAMScheduler scheduler = new BAMScheduler(dataSource, IntervalMergingRule.ALL); - final GenomeLocSortedSet intervals = GenomeLocSortedSet.createSetFromSequenceDictionary(dataSource.getHeader().getSequenceDictionary()); - scheduler.populateFilteredIntervalList(intervals); - return scheduler; - } - - public static BAMScheduler createOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) { - BAMScheduler scheduler = new BAMScheduler(dataSource, IntervalMergingRule.ALL); - scheduler.populateUnfilteredIntervalList(parser); - return scheduler; - } - - public static BAMScheduler createOverIntervals(final SAMDataSource dataSource, final IntervalMergingRule mergeRule, final GenomeLocSortedSet loci) { - BAMScheduler scheduler = new BAMScheduler(dataSource, mergeRule); - scheduler.populateFilteredIntervalList(loci); - return scheduler; - } - - - private BAMScheduler(final SAMDataSource dataSource, final IntervalMergingRule mergeRule) { - this.dataSource = dataSource; - this.intervalMergingRule = mergeRule; - for(SAMReaderID reader: dataSource.getReaderIDs()) { - GATKBAMIndex index = dataSource.getIndex(reader); - if(index != null) - indexFiles.put(reader,dataSource.getIndex(reader)); - } - } - - /** - * The consumer has asked for a bounded set of locations. Prepare an iterator over those locations. - * @param loci The list of locations to search and iterate over. - */ - private void populateFilteredIntervalList(final GenomeLocSortedSet loci) { - this.loci = loci; - if(!indexFiles.isEmpty()) { - // If index data is available, start up the iterator. - locusIterator = new PeekableIterator(loci.iterator()); - if(locusIterator.hasNext()) - currentLocus = locusIterator.next(); - advance(); - } - else { - // Otherwise, seed the iterator with a single file pointer over the entire region. - nextFilePointer = generatePointerOverEntireFileset(); - for(GenomeLoc locus: loci) - nextFilePointer.addLocation(locus); - locusIterator = new PeekableIterator(Collections.emptyList().iterator()); - } - } - - /** - * The consumer has provided null, meaning to iterate over all available data. Create a file pointer stretching - * from just before the start of the region to the end of the region. - */ - private void populateUnfilteredIntervalList(final GenomeLocParser parser) { - this.loci = new GenomeLocSortedSet(parser); - locusIterator = new PeekableIterator(Collections.emptyList().iterator()); - nextFilePointer = generatePointerOverEntireFileset(); - } - - /** - * Generate a span that runs from the end of the BAM header to the end of the fle. - * @return A file pointer over the specified region. - */ - private FilePointer generatePointerOverEntireFileset() { - FilePointer filePointer = new FilePointer(intervalMergingRule); - - // This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is - // the only FilePointer we will create. This allows us to have this FilePointer represent regions from - // multiple contigs - filePointer.setIsMonolithic(true); - - Map currentPosition; - - currentPosition = dataSource.getInitialReaderPositions(); - - for(SAMReaderID reader: dataSource.getReaderIDs()) - filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart())); - return filePointer; - } - - public boolean hasNext() { - return nextFilePointer != null; - } - - public FilePointer next() { - if(!hasNext()) - throw new NoSuchElementException("No next element available in interval sharder"); - FilePointer currentFilePointer = nextFilePointer; - nextFilePointer = null; - advance(); - - return currentFilePointer; - } - - public void remove() { - throw new UnsupportedOperationException("Unable to remove FilePointers from an IntervalSharder"); - } - - private void advance() { - if(loci.isEmpty()) - return; - - while(nextFilePointer == null && currentLocus != null) { - // special case handling of the unmapped shard. - if(currentLocus == GenomeLoc.UNMAPPED) { - nextFilePointer = new FilePointer(intervalMergingRule, GenomeLoc.UNMAPPED); - for(SAMReaderID id: dataSource.getReaderIDs()) - nextFilePointer.addFileSpans(id,createSpanToEndOfFile(indexFiles.get(id).getStartOfLastLinearBin())); - currentLocus = null; - continue; - } - - nextFilePointer = new FilePointer(intervalMergingRule); - - int coveredRegionStart = 1; - int coveredRegionStop = Integer.MAX_VALUE; - GenomeLoc coveredRegion = null; - - BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(currentLocus); - - // No overlapping data at all. - if(scheduleEntry != null) { - coveredRegionStart = Math.max(coveredRegionStart,scheduleEntry.start); - coveredRegionStop = Math.min(coveredRegionStop,scheduleEntry.stop); - coveredRegion = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStart,coveredRegionStop); - - nextFilePointer.addFileSpans(scheduleEntry.fileSpans); - } - else { - // Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty. - for(SAMReaderID reader: indexFiles.keySet()) - nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan()); - } - - // Early exit if no bins were found. - if(coveredRegion == null) { - // for debugging only: maximum split is 16384. - nextFilePointer.addLocation(currentLocus); - currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; - continue; - } - - // Early exit if only part of the first interval was found. - if(currentLocus.startsBefore(coveredRegion)) { - int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart(); - GenomeLoc[] splitContigs = currentLocus.split(splitPoint); - nextFilePointer.addLocation(splitContigs[0]); - currentLocus = splitContigs[1]; - continue; - } - - // Define the initial range of the file pointer, aka the region where the locus currently being processed intersects the BAM list. - GenomeLoc initialLocation = currentLocus.intersect(coveredRegion); - nextFilePointer.addLocation(initialLocation); - - // See whether the BAM regions discovered overlap the next set of intervals in the interval list. If so, include every overlapping interval. - if(!nextFilePointer.locations.isEmpty()) { - while(locusIterator.hasNext() && locusIterator.peek().overlapsP(coveredRegion)) { - currentLocus = locusIterator.next(); - nextFilePointer.addLocation(currentLocus.intersect(coveredRegion)); - } - - // Chop off the uncovered portion of the locus. Since we know that the covered region overlaps the current locus, - // we can simplify the interval creation process to the end of the covered region to the stop of the given interval. - if(coveredRegionStop < currentLocus.getStop()) - currentLocus = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStop+1,currentLocus.getStop()); - else if(locusIterator.hasNext()) - currentLocus = locusIterator.next(); - else - currentLocus = null; - } - - } - } - - - /** - * The last reference sequence processed by this iterator. - */ - private Integer lastReferenceSequenceLoaded = null; - - /** - * The stateful iterator used to progress through the genoem. - */ - private PeekableIterator bamScheduleIterator = null; - - /** - * Clean up underlying BAMSchedule file handles. - */ - public void close() { - if(bamScheduleIterator != null) - bamScheduleIterator.close(); - } - - /** - * Get the next overlapping tree of bins associated with the given BAM file. - * @param currentLocus The actual locus for which to check overlap. - * @return The next schedule entry overlapping with the given list of loci. - */ - private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final GenomeLoc currentLocus) { - // Make sure that we consult the BAM header to ensure that we're using the correct contig index for this contig name. - // This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then - // we'll be using the correct contig index for the BAMs. - // TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing. - SAMSequenceRecord currentContigSequenceRecord = dataSource.getHeader().getSequence(currentLocus.getContig()); - if ( currentContigSequenceRecord == null ) { - throw new UserException(String.format("Contig %s not present in sequence dictionary for merged BAM header: %s", - currentLocus.getContig(), - ReadUtils.prettyPrintSequenceRecords(dataSource.getHeader().getSequenceDictionary()))); - } - - final int currentContigIndex = currentContigSequenceRecord.getSequenceIndex(); - - // Stale reference sequence or first invocation. (Re)create the binTreeIterator. - if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) { - if(bamScheduleIterator != null) - bamScheduleIterator.close(); - lastReferenceSequenceLoaded = currentContigIndex; - - // Naive algorithm: find all elements in current contig for proper schedule creation. - List lociInContig = new LinkedList(); - for(GenomeLoc locus: loci) { - if (!GenomeLoc.isUnmapped(locus) && dataSource.getHeader().getSequence(locus.getContig()) == null) - throw new ReviewedGATKException("BAM file(s) do not have the contig: " + locus.getContig() + ". You are probably using a different reference than the one this file was aligned with"); - - if (!GenomeLoc.isUnmapped(locus) && dataSource.getHeader().getSequence(locus.getContig()).getSequenceIndex() == lastReferenceSequenceLoaded) - lociInContig.add(locus); - } - - bamScheduleIterator = new PeekableIterator(new BAMSchedule(dataSource,lociInContig)); - } - - if(!bamScheduleIterator.hasNext()) - return null; - - // Peek the iterator along until finding the first binTree at or following the current locus. - BAMScheduleEntry bamScheduleEntry = bamScheduleIterator.peek(); - while(bamScheduleEntry != null && bamScheduleEntry.isBefore(currentLocus)) { - bamScheduleIterator.next(); - bamScheduleEntry = bamScheduleIterator.hasNext() ? bamScheduleIterator.peek() : null; - } - - return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? bamScheduleEntry : null; - } - - /** - * Create a span from the given start point to the end of the file. - * @param startOfRegion Start of the region, in encoded coordinates (block start << 16 & block offset). - * @return A file span from the given point to the end of the file. - */ - private GATKBAMFileSpan createSpanToEndOfFile(final long startOfRegion) { - return new GATKBAMFileSpan(new GATKChunk(startOfRegion,Long.MAX_VALUE)); - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockInputStream.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockInputStream.java deleted file mode 100644 index 11fecb661..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockInputStream.java +++ /dev/null @@ -1,450 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.GATKBAMFileSpan; -import htsjdk.samtools.GATKChunk; -import htsjdk.samtools.util.BlockCompressedInputStream; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.io.IOException; -import java.io.InputStream; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; - -/** - * Presents decompressed blocks to the SAMFileReader. - */ -public class BlockInputStream extends InputStream { - /** - * Mechanism for triggering block loads. - */ - private final BGZFBlockLoadingDispatcher dispatcher; - - /** - * The reader whose data is supplied by this input stream. - */ - private final SAMReaderID reader; - - /** - * Length of the input stream. - */ - private final long length; - - /** - * The latest error reported by an asynchronous block load. - */ - private Throwable error; - - /** - * Current accessPlan. - */ - private BAMAccessPlan accessPlan; - - /** - * A stream of compressed data blocks. - */ - private final ByteBuffer buffer; - - /** - * Offsets of the given blocks in the buffer. - */ - private LinkedList blockOffsets = new LinkedList(); - - /** - * Source positions of the given blocks in the buffer. - */ - private LinkedList blockPositions = new LinkedList(); - - /** - * Provides a lock to wait for more data to arrive. - */ - private final Object lock = new Object(); - - /** - * An input stream to use when comparing data back to what it should look like. - */ - private final BlockCompressedInputStream validatingInputStream; - - /** - * Create a new block presenting input stream with a dedicated buffer. - * @param dispatcher the block loading messenger. - * @param reader the reader for which to load data. - * @param validate validates the contents read into the buffer against the contents of a Picard BlockCompressedInputStream. - */ - BlockInputStream(final BGZFBlockLoadingDispatcher dispatcher, final SAMReaderID reader, final boolean validate) { - this.reader = reader; - this.length = reader.samFile.length(); - - buffer = ByteBuffer.wrap(new byte[64*1024]); - buffer.order(ByteOrder.LITTLE_ENDIAN); - - // The state of the buffer assumes that the range of data written into the buffer appears in the range - // [position,limit), while extra capacity exists in the range [limit,capacity) - buffer.limit(0); - - this.dispatcher = dispatcher; - // TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream. - this.accessPlan = new BAMAccessPlan(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE))); - - // The block offsets / block positions guarantee that the ending offset/position in the data structure maps to - // the point in the file just following the last read. These two arrays should never be empty; initializing - // to 0 to match the position above. - this.blockOffsets.add(0); - this.blockPositions.add(0L); - - try { - if(validate) { - System.out.printf("BlockInputStream %s: BGZF block validation mode activated%n",this); - validatingInputStream = new BlockCompressedInputStream(reader.samFile); - // A bug in ValidatingInputStream means that calling getFilePointer() immediately after initialization will result in an NPE. - // Poke the stream to start reading data. - validatingInputStream.available(); - } - else - validatingInputStream = null; - } - catch(IOException ex) { - throw new ReviewedGATKException("Unable to validate against Picard input stream",ex); - } - } - - public long length() { - return length; - } - - public long getFilePointer() { - long filePointer; - synchronized(lock) { - // Find the current block within the input stream. - int blockIndex; - for(blockIndex = 0; blockIndex+1 < blockOffsets.size() && buffer.position() > blockOffsets.get(blockIndex+1); blockIndex++) - ; - filePointer = blockPositions.get(blockIndex) + (buffer.position()-blockOffsets.get(blockIndex)); - } - -// if(validatingInputStream != null && filePointer != validatingInputStream.getFilePointer()) -// throw new ReviewedGATKException(String.format("Position of input stream is invalid; expected (block address, block offset) = (%d,%d), got (%d,%d)", -// BlockCompressedFilePointerUtil.getBlockAddress(validatingInputStream.getFilePointer()),BlockCompressedFilePointerUtil.getBlockOffset(validatingInputStream.getFilePointer()), -// BlockCompressedFilePointerUtil.getBlockAddress(filePointer),BlockCompressedFilePointerUtil.getBlockOffset(filePointer))); - - return filePointer; - } - - private void clearBuffers() { - this.accessPlan.reset(); - - // Buffer semantics say that outside of a lock, buffer should always be prepared for reading. - // Indicate no data to be read. - buffer.clear(); - buffer.limit(0); - - // Clear everything except the last block offset / position - blockOffsets.clear(); - blockOffsets.add(0); - while(blockPositions.size() > 1) - blockPositions.removeFirst(); - } - - public boolean eof() { - synchronized(lock) { - // TODO: Handle multiple empty BGZF blocks at end of the file. - return accessPlan != null && (accessPlan.getBlockAddress() < 0 || accessPlan.getBlockAddress() >= length); - } - } - - /** - * Submits a new access plan for the given dataset and seeks to the given point. - * @param accessPlan The next seek point for BAM data in this reader. - */ - public void submitAccessPlan(final BAMAccessPlan accessPlan) { - //System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress()); - this.accessPlan = accessPlan; - accessPlan.reset(); - - clearBuffers(); - - // Pull the iterator past any oddball chunks at the beginning of the shard (chunkEnd < chunkStart, empty chunks, etc). - // TODO: Don't pass these empty chunks in. - accessPlan.advancePosition(makeFilePointer(accessPlan.getBlockAddress(),0)); - - if(accessPlan.getBlockAddress() >= 0) { - waitForBufferFill(); - } - - if(validatingInputStream != null) { - try { - validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(),0)); - } - catch(IOException ex) { - throw new ReviewedGATKException("Unable to validate against Picard input stream",ex); - } - } - - } - - - private void compactBuffer() { - // Compact buffer to maximize storage space. - int bytesToRemove = 0; - - // Look ahead to see if we can compact away the first blocks in the series. - while(blockOffsets.size() > 1 && buffer.position() >= blockOffsets.get(1)) { - blockOffsets.remove(); - blockPositions.remove(); - bytesToRemove = blockOffsets.peek(); - } - - // If we end up with an empty block at the end of the series, compact this as well. - if(buffer.remaining() == 0 && blockOffsets.size() > 1 && buffer.position() >= blockOffsets.peek()) { - bytesToRemove += buffer.position(); - blockOffsets.remove(); - blockPositions.remove(); - } - - int finalBufferStart = buffer.position() - bytesToRemove; - int finalBufferSize = buffer.remaining(); - - // Position the buffer to remove the unneeded data, and compact it away. - buffer.position(bytesToRemove); - buffer.compact(); - - // Reset the limits for reading. - buffer.position(finalBufferStart); - buffer.limit(finalBufferStart+finalBufferSize); - - // Shift everything in the offset buffer down to accommodate the bytes removed from the buffer. - for(int i = 0; i < blockOffsets.size(); i++) - blockOffsets.set(i,blockOffsets.get(i)-bytesToRemove); - } - - /** - * Push contents of incomingBuffer into the end of this buffer. - * MUST be called from a thread that is NOT the reader thread. - * @param incomingBuffer The data being pushed into this input stream. - * @param accessPlan target access plan for the data. - * @param filePosition the current position of the file pointer - */ - public void copyIntoBuffer(final ByteBuffer incomingBuffer, final BAMAccessPlan accessPlan, final long filePosition) { - synchronized(lock) { - try { - if(validatingInputStream != null) { - byte[] validBytes = new byte[incomingBuffer.remaining()]; - - byte[] currentBytes = new byte[incomingBuffer.remaining()]; - int pos = incomingBuffer.position(); - int lim = incomingBuffer.limit(); - incomingBuffer.get(currentBytes); - - incomingBuffer.limit(lim); - incomingBuffer.position(pos); - - long currentFilePointer = validatingInputStream.getFilePointer(); - validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(), 0)); - validatingInputStream.read(validBytes); - validatingInputStream.seek(currentFilePointer); - - if(!Arrays.equals(validBytes,currentBytes)) - throw new ReviewedGATKException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this)); - } - - compactBuffer(); - // Open up the buffer for more reading. - buffer.limit(buffer.capacity()); - - // Get the spans overlapping this particular block... - List spansOverlapping = accessPlan.getSpansOverlappingBlock(accessPlan.getBlockAddress(),filePosition); - - // ...and advance the block - this.accessPlan = accessPlan; - accessPlan.advancePosition(makeFilePointer(filePosition, 0)); - - if(buffer.remaining() < incomingBuffer.remaining()) - lock.wait(); - - final int bytesInIncomingBuffer = incomingBuffer.limit(); - - for(GATKChunk spanOverlapping: spansOverlapping) { - // Clear out the endcap tracking state and add in the starting position for this transfer. - blockOffsets.removeLast(); - blockOffsets.add(buffer.position()); - blockPositions.removeLast(); - blockPositions.add(spanOverlapping.getChunkStart()); - - // Stream the buffer into the data stream. - incomingBuffer.limit((spanOverlapping.getBlockEnd() > spanOverlapping.getBlockStart()) ? bytesInIncomingBuffer : spanOverlapping.getBlockOffsetEnd()); - incomingBuffer.position(spanOverlapping.getBlockOffsetStart()); - buffer.put(incomingBuffer); - - // Add the endcap for this transfer. - blockOffsets.add(buffer.position()); - blockPositions.add(spanOverlapping.getChunkEnd()); - } - - // Set up the buffer for reading. - buffer.flip(); - - lock.notify(); - } - catch(Exception ex) { - reportException(ex); - lock.notify(); - } - } - } - - void reportException(Throwable t) { - synchronized(lock) { - this.error = t; - lock.notify(); - } - } - - private void checkForErrors() { - synchronized(lock) { - if(error != null) { - ReviewedGATKException toThrow = new ReviewedGATKException(String.format("Thread %s, BlockInputStream %s: Unable to retrieve BAM data from disk",Thread.currentThread().getId(),this),error); - toThrow.setStackTrace(error.getStackTrace()); - throw toThrow; - } - } - } - - /** - * Reads the next byte of data from the input stream. - * @return Next byte of data, from 0->255, as an int. - */ - @Override - public int read() { - byte[] singleByte = new byte[1]; - read(singleByte); - return singleByte[0]; - } - - /** - * Fills the given byte array to the extent possible. - * @param bytes byte array to be filled. - * @return The number of bytes actually read. - */ - @Override - public int read(byte[] bytes) { - return read(bytes,0,bytes.length); - } - - @Override - public int read(byte[] bytes, final int offset, final int length) { - int remaining = length; - synchronized(lock) { - while(remaining > 0) { - // Check for error conditions during last read. - checkForErrors(); - - // If completely out of space, queue up another buffer fill. - waitForBufferFill(); - - // Couldn't manage to load any data at all; abort and return what's available. - if(buffer.remaining() == 0) - break; - - int numBytesToCopy = Math.min(buffer.remaining(),remaining); - buffer.get(bytes,length-remaining+offset,numBytesToCopy); - remaining -= numBytesToCopy; - - //if(remaining > 0) - // System.out.printf("Thread %s: read the first %d bytes of a %d byte request%n",Thread.currentThread().getId(),length-remaining,length); - // TODO: Assert that we don't copy across a block boundary - } - - // Notify any waiting threads that some of the contents of the buffer were removed. - if(length-remaining > 0) - lock.notify(); - } - -// if(validatingInputStream != null) { -// byte[] validBytes = new byte[length]; -// try { -// validatingInputStream.read(validBytes,offset,length); -// for(int i = offset; i < offset+length; i++) { -// if(bytes[i] != validBytes[i]) -// throw new ReviewedGATKException(String.format("Thread %s: blockInputStream %s attempting to return wrong set of bytes; mismatch at offset %d",Thread.currentThread().getId(),this,i)); -// } -// } -// catch(IOException ex) { -// throw new ReviewedGATKException("Unable to validate against Picard input stream",ex); -// } -// } - - // If any data was copied into the buffer, return the amount of data copied. - if(remaining < length) - return length - remaining; - - // Otherwise, return -1. - return -1; - } - - public void close() { - if(validatingInputStream != null) { - try { - validatingInputStream.close(); - } - catch(IOException ex) { - throw new ReviewedGATKException("Unable to validate against Picard input stream",ex); - } - } - } - - public String getSource() { - return reader.getSamFilePath(); - } - - private void waitForBufferFill() { - synchronized(lock) { - if(buffer.remaining() == 0 && !eof()) { - //System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this); - dispatcher.queueBlockLoad(accessPlan); - try { - lock.wait(); - } - catch(InterruptedException ex) { - throw new ReviewedGATKException("Interrupt occurred waiting for buffer to fill",ex); - } - } - } - } - - /** - * Create an encoded BAM file pointer given the address of a BGZF block and an offset. - * @param blockAddress Physical address on disk of a BGZF block. - * @param blockOffset Offset into the uncompressed data stored in the BGZF block. - * @return 64-bit pointer encoded according to the BAM spec. - */ - public static long makeFilePointer(final long blockAddress, final int blockOffset) { - return blockAddress << 16 | blockOffset; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FileHandleCache.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FileHandleCache.java deleted file mode 100644 index 8d5ab3b03..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FileHandleCache.java +++ /dev/null @@ -1,232 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.GATKException; - -import java.io.FileInputStream; -import java.io.IOException; -import java.util.Collection; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Queue; - -/** - * Caches frequently used file handles. Right now, caches only a single file handle. - * TODO: Generalize to support arbitrary file handle caches. - */ -public class FileHandleCache { - /** - * The underlying data structure storing file handles. - */ - private final FileHandleStorage fileHandleStorage; - - /** - * How many file handles should be kept open at once. - */ - private final int cacheSize; - - /** - * A uniquifier: assign a unique ID to every instance of a file handle. - */ - private final Map keyCounter = new HashMap(); - - /** - * A shared lock, private so that outside users cannot notify it. - */ - private final Object lock = new Object(); - - /** - * Indicates how many file handles are outstanding at this point. - */ - private int numOutstandingFileHandles = 0; - - /** - * Create a new file handle cache of the given cache size. - * @param cacheSize how many readers to hold open at once. - */ - public FileHandleCache(final int cacheSize) { - this.cacheSize = cacheSize; - fileHandleStorage = new FileHandleStorage(); - } - - /** - * Retrieves or opens a file handle for the given reader ID. - * @param key The ke - * @return A file input stream from the cache, if available, or otherwise newly opened. - */ - public FileInputStream claimFileInputStream(final SAMReaderID key) { - synchronized(lock) { - FileInputStream inputStream = findExistingEntry(key); - if(inputStream == null) { - try { - // If the cache is maxed out, wait for another file handle to emerge. - if(numOutstandingFileHandles >= cacheSize) - lock.wait(); - } - catch(InterruptedException ex) { - throw new ReviewedGATKException("Interrupted while waiting for a file handle"); - } - inputStream = openInputStream(key); - } - numOutstandingFileHandles++; - - //System.out.printf("Handing input stream %s to thread %s%n",inputStream,Thread.currentThread().getId()); - return inputStream; - } - } - - /** - * Releases the current reader and returns it to the cache. - * @param key The reader. - * @param inputStream The stream being used. - */ - public void releaseFileInputStream(final SAMReaderID key, final FileInputStream inputStream) { - synchronized(lock) { - numOutstandingFileHandles--; - UniqueKey newID = allocateKey(key); - fileHandleStorage.put(newID,inputStream); - // Let any listeners know that another file handle has become available. - lock.notify(); - } - } - - /** - * Finds an existing entry in the storage mechanism. - * @param key Reader. - * @return a cached stream, if available. Otherwise, - */ - private FileInputStream findExistingEntry(final SAMReaderID key) { - int existingHandles = getMostRecentUniquifier(key); - - // See if any of the keys currently exist in the repository. - for(int i = 0; i <= existingHandles; i++) { - UniqueKey uniqueKey = new UniqueKey(key,i); - if(fileHandleStorage.containsKey(uniqueKey)) - return fileHandleStorage.remove(uniqueKey); - } - - return null; - } - - /** - * Gets the most recent uniquifier used for the given reader. - * @param reader Reader for which to determine uniqueness. - * @return - */ - private int getMostRecentUniquifier(final SAMReaderID reader) { - if(keyCounter.containsKey(reader)) - return keyCounter.get(reader); - else return -1; - } - - private UniqueKey allocateKey(final SAMReaderID reader) { - int uniquifier = getMostRecentUniquifier(reader)+1; - keyCounter.put(reader,uniquifier); - return new UniqueKey(reader,uniquifier); - } - - private FileInputStream openInputStream(final SAMReaderID reader) { - try { - return new FileInputStream(reader.getSamFilePath()); - } - catch(IOException ex) { - throw new GATKException("Unable to open input file"); - } - } - - private void closeInputStream(final FileInputStream inputStream) { - try { - inputStream.close(); - } - catch(IOException ex) { - throw new GATKException("Unable to open input file"); - } - } - - /** - * Actually contains the file handles, purging them as they get too old. - */ - private class FileHandleStorage extends LinkedHashMap { - /** - * Remove the oldest entry - * @param entry Entry to consider removing. - * @return True if the cache size has been exceeded. False otherwise. - */ - @Override - protected boolean removeEldestEntry(Map.Entry entry) { - synchronized (lock) { - if(size() > cacheSize) { - keyCounter.put(entry.getKey().key,keyCounter.get(entry.getKey().key)-1); - closeInputStream(entry.getValue()); - - return true; - } - } - return false; - } - } - - /** - * Uniquifies a key by adding a numerical uniquifier. - */ - private class UniqueKey { - /** - * The file handle's key. - */ - private final SAMReaderID key; - - /** - * A uniquifier, so that multiple of the same reader can exist in the cache. - */ - private final int uniqueID; - - public UniqueKey(final SAMReaderID reader, final int uniqueID) { - this.key = reader; - this.uniqueID = uniqueID; - } - - @Override - public boolean equals(Object other) { - if(!(other instanceof UniqueKey)) - return false; - UniqueKey otherUniqueKey = (UniqueKey)other; - return key.equals(otherUniqueKey.key) && this.uniqueID == otherUniqueKey.uniqueID; - } - - @Override - public int hashCode() { - return key.hashCode(); - } - } - - - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointer.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointer.java deleted file mode 100644 index 99d9def5a..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointer.java +++ /dev/null @@ -1,436 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.util.PeekableIterator; -import htsjdk.samtools.GATKBAMFileSpan; -import htsjdk.samtools.GATKChunk; -import htsjdk.samtools.SAMFileSpan; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; -import org.broadinstitute.gatk.utils.interval.IntervalUtils; - -import java.util.*; - -/** - * Represents a small section of a BAM file, and every associated interval. - */ -public class FilePointer { - protected final SortedMap fileSpans = new TreeMap(); - protected final List locations = new ArrayList(); - protected final IntervalMergingRule intervalMergingRule; - - /** - * Does this file pointer point into an unmapped region? - */ - protected final boolean isRegionUnmapped; - - /** - * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will - * ever visit during this GATK run? If this is set to true, the engine will expect to see only this - * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals - * from more than one contig. - */ - private boolean isMonolithic = false; - - /** - * Index of the contig covered by this FilePointer. Only meaningful for non-monolithic, mapped FilePointers - */ - private Integer contigIndex = null; - - - public FilePointer( final IntervalMergingRule mergeRule, final List locations ) { - this.intervalMergingRule = mergeRule; - this.locations.addAll(locations); - this.isRegionUnmapped = checkUnmappedStatus(); - - validateAllLocations(); - if ( locations.size() > 0 ) { - contigIndex = locations.get(0).getContigIndex(); - } - } - - public FilePointer( final IntervalMergingRule mergeRule, final GenomeLoc... locations ) { - this(mergeRule, Arrays.asList(locations)); - } - - public FilePointer( final Map fileSpans, final IntervalMergingRule mergeRule, final List locations ) { - this(mergeRule, locations); - this.fileSpans.putAll(fileSpans); - } - - private boolean checkUnmappedStatus() { - boolean foundMapped = false, foundUnmapped = false; - - for( GenomeLoc location: locations ) { - if ( GenomeLoc.isUnmapped(location) ) - foundUnmapped = true; - else - foundMapped = true; - } - if ( foundMapped && foundUnmapped ) - throw new ReviewedGATKException("BUG: File pointers cannot be mixed mapped/unmapped."); - - return foundUnmapped; - } - - private void validateAllLocations() { - // Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction - if ( isRegionUnmapped || isMonolithic ) { - return; - } - - Integer previousContigIndex = null; - - for ( GenomeLoc location : locations ) { - if ( previousContigIndex != null && previousContigIndex != location.getContigIndex() ) { - throw new ReviewedGATKException("Non-monolithic file pointers must contain intervals from at most one contig"); - } - - previousContigIndex = location.getContigIndex(); - } - } - - private void validateLocation( GenomeLoc location ) { - if ( isRegionUnmapped != GenomeLoc.isUnmapped(location) ) { - throw new ReviewedGATKException("BUG: File pointers cannot be mixed mapped/unmapped."); - } - if ( ! isRegionUnmapped && ! isMonolithic && contigIndex != null && contigIndex != location.getContigIndex() ) { - throw new ReviewedGATKException("Non-monolithic file pointers must contain intervals from at most one contig"); - } - } - - /** - * Returns an immutable view of this FilePointer's file spans - * - * @return an immutable view of this FilePointer's file spans - */ - public Map getFileSpans() { - return Collections.unmodifiableMap(fileSpans); - } - - /** - * Returns an immutable variant of the list of locations. - * @return - */ - public List getLocations() { - return Collections.unmodifiableList(locations); - } - - /** - * Returns the index of the contig into which this FilePointer points (a FilePointer can represent - * regions in at most one contig). - * - * @return the index of the contig into which this FilePointer points - */ - public int getContigIndex() { - return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; - } - - /** - * Returns the IntervalMergingRule used by this FilePointer to merge adjacent locations - * - * @return the IntervalMergingRule used by this FilePointer (never null) - */ - public IntervalMergingRule getIntervalMergingRule() { - return intervalMergingRule; - } - - /** - * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will - * ever visit during this GATK run? If this is set to true, the engine will expect to see only this - * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals - * from more than one contig. - * - * @return true if this FP is a monolithic FP representing all regions in all files, otherwise false - */ - public boolean isMonolithic() { - return isMonolithic; - } - - /** - * Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all - * regions in all files that we will ever visit, and is the only FP we will ever create. A monolithic - * FP may contain intervals from more than one contig. - * - * @param isMonolithic set this FP's monolithic status to this value - */ - public void setIsMonolithic( boolean isMonolithic ) { - this.isMonolithic = isMonolithic; - } - - @Override - public boolean equals(final Object other) { - if(!(other instanceof FilePointer)) - return false; - FilePointer otherFilePointer = (FilePointer)other; - - // intervals - if(this.locations.size() != otherFilePointer.locations.size()) - return false; - for(int i = 0; i < locations.size(); i++) { - if(!this.locations.get(i).equals(otherFilePointer.locations.get(i))) - return false; - } - - // fileSpans - if(this.fileSpans.size() != otherFilePointer.fileSpans.size()) - return false; - Iterator> thisEntries = this.fileSpans.entrySet().iterator(); - Iterator> otherEntries = otherFilePointer.fileSpans.entrySet().iterator(); - while(thisEntries.hasNext() || otherEntries.hasNext()) { - if(!thisEntries.next().equals(otherEntries.next())) - return false; - } - - return true; - } - - public void addLocation(final GenomeLoc location) { - validateLocation(location); - - this.locations.add(location); - if ( contigIndex == null ) { - contigIndex = location.getContigIndex(); - } - } - - public void addFileSpans(final SAMReaderID id, final SAMFileSpan fileSpan) { - this.fileSpans.put(id,fileSpan); - } - - public void addFileSpans(final Map fileSpans) { - this.fileSpans.putAll(fileSpans); - } - - - /** - * Computes the size of this file span, in uncompressed bytes. - * @return Size of the file span. - */ - public long size() { - long size = 0L; - for(SAMFileSpan fileSpan: fileSpans.values()) - size += ((GATKBAMFileSpan)fileSpan).size(); - return size; - } - - /** - * Returns the difference in size between two filespans. - * @param other Other filespan against which to measure. - * @return The difference in size between the two file pointers. - */ - public long minus(final FilePointer other) { - long difference = 0; - PeekableIterator> thisIterator = new PeekableIterator>(this.fileSpans.entrySet().iterator()); - PeekableIterator> otherIterator = new PeekableIterator>(other.fileSpans.entrySet().iterator()); - - while(thisIterator.hasNext()) { - // If there are no elements left in the 'other' iterator, spin out this iterator. - if(!otherIterator.hasNext()) { - GATKBAMFileSpan nextSpan = (GATKBAMFileSpan)thisIterator.next().getValue(); - difference += nextSpan.size(); - continue; - } - - // Otherwise, compare the latest value. - int compareValue = thisIterator.peek().getKey().compareTo(otherIterator.peek().getKey()); - - if(compareValue < 0) { - // This before other. - difference += ((GATKBAMFileSpan)thisIterator.next().getValue()).size(); - } - else if(compareValue > 0) { - // Other before this. - difference += ((GATKBAMFileSpan)otherIterator.next().getValue()).size(); - } - else { - // equality; difference the values. - GATKBAMFileSpan thisRegion = (GATKBAMFileSpan)thisIterator.next().getValue(); - GATKBAMFileSpan otherRegion = (GATKBAMFileSpan)otherIterator.next().getValue(); - difference += Math.abs(thisRegion.minus(otherRegion).size()); - } - } - return difference; - } - - /** - * Combines two file pointers into one. - * @param parser The genomelocparser to use when manipulating intervals. - * @param other File pointer to combine into this one. - * @return A completely new file pointer that is the combination of the two. - */ - public FilePointer combine(final GenomeLocParser parser, final FilePointer other) { - FilePointer combined = new FilePointer(intervalMergingRule); - - List intervals = new ArrayList(); - intervals.addAll(locations); - intervals.addAll(other.locations); - for(GenomeLoc interval: IntervalUtils.sortAndMergeIntervals(parser,intervals,intervalMergingRule)) - combined.addLocation(interval); - - PeekableIterator> thisIterator = new PeekableIterator>(this.fileSpans.entrySet().iterator()); - PeekableIterator> otherIterator = new PeekableIterator>(other.fileSpans.entrySet().iterator()); - - while(thisIterator.hasNext() || otherIterator.hasNext()) { - int compareValue; - if(!otherIterator.hasNext()) { - compareValue = -1; - } - else if(!thisIterator.hasNext()) - compareValue = 1; - else - compareValue = thisIterator.peek().getKey().compareTo(otherIterator.peek().getKey()); - - // This before other. - if(compareValue < 0) - mergeElementsInto(combined,thisIterator); - // Other before this. - else if(compareValue > 0) - mergeElementsInto(combined,otherIterator); - // equality; union the values. - else - mergeElementsInto(combined,thisIterator,otherIterator); - } - return combined; - } - - /** - * Roll the next element in the iterator into the combined entry. - * @param combined Entry into which to roll the next element. - * @param iterators Sources of next elements. - */ - private void mergeElementsInto(final FilePointer combined, Iterator>... iterators) { - if(iterators.length == 0) - throw new ReviewedGATKException("Tried to add zero elements to an existing file pointer."); - Map.Entry initialElement = iterators[0].next(); - GATKBAMFileSpan fileSpan = (GATKBAMFileSpan)initialElement.getValue(); - for(int i = 1; i < iterators.length; i++) - fileSpan = fileSpan.union((GATKBAMFileSpan)iterators[i].next().getValue()); - combined.addFileSpans(initialElement.getKey(),fileSpan); - } - - /** - * Efficiently generate the union of the n FilePointers passed in. Much more efficient than - * combining two FilePointers at a time using the combine() method above. - * - * IMPORTANT: the FilePointers to be unioned must either all represent regions on the - * same contig, or all be unmapped, since we cannot create FilePointers with a mix of - * contigs or with mixed mapped/unmapped regions. - * - * @param filePointers the FilePointers to union - * @param parser our GenomeLocParser - * @return the union of the FilePointers passed in - */ - public static FilePointer union( List filePointers, GenomeLocParser parser ) { - if ( filePointers == null || filePointers.isEmpty() ) { - return new FilePointer(IntervalMergingRule.ALL); - } - - Map> fileChunks = new HashMap>(); - List locations = new ArrayList(); - IntervalMergingRule mergeRule = filePointers.get(0).getIntervalMergingRule(); - - // First extract all intervals and file chunks from the FilePointers into unsorted, unmerged collections - for ( FilePointer filePointer : filePointers ) { - locations.addAll(filePointer.getLocations()); - if (mergeRule != filePointer.getIntervalMergingRule()) - throw new ReviewedGATKException("All FilePointers in FilePointer.union() must have use the same IntervalMergeRule"); - - for ( Map.Entry fileSpanEntry : filePointer.getFileSpans().entrySet() ) { - GATKBAMFileSpan fileSpan = (GATKBAMFileSpan)fileSpanEntry.getValue(); - - if ( fileChunks.containsKey(fileSpanEntry.getKey()) ) { - fileChunks.get(fileSpanEntry.getKey()).addAll(fileSpan.getGATKChunks()); - } - else { - fileChunks.put(fileSpanEntry.getKey(), fileSpan.getGATKChunks()); - } - } - } - - // Now sort and merge the intervals - List sortedMergedLocations = new ArrayList(); - sortedMergedLocations.addAll(IntervalUtils.sortAndMergeIntervals(parser, locations, mergeRule)); - - // For each BAM file, convert from an unsorted, unmerged list of chunks to a GATKBAMFileSpan containing - // the sorted, merged union of the chunks for that file - Map mergedFileSpans = new HashMap(fileChunks.size()); - for ( Map.Entry> fileChunksEntry : fileChunks.entrySet() ) { - List unmergedChunks = fileChunksEntry.getValue(); - mergedFileSpans.put(fileChunksEntry.getKey(), - (new GATKBAMFileSpan(unmergedChunks.toArray(new GATKChunk[unmergedChunks.size()]))).union(new GATKBAMFileSpan())); - } - - return new FilePointer(mergedFileSpans, mergeRule, sortedMergedLocations); - } - - /** - * Returns true if any of the file spans in this FilePointer overlap their counterparts in - * the other FilePointer. "Overlap" is defined as having an overlapping extent (the region - * from the start of the first chunk to the end of the last chunk). - * - * @param other the FilePointer against which to check overlap with this FilePointer - * @return true if any file spans overlap their counterparts in other, otherwise false - */ - public boolean hasFileSpansOverlappingWith( FilePointer other ) { - for ( Map.Entry thisFilePointerEntry : fileSpans.entrySet() ) { - GATKBAMFileSpan thisFileSpan = new GATKBAMFileSpan(thisFilePointerEntry.getValue()); - - SAMFileSpan otherEntry = other.fileSpans.get(thisFilePointerEntry.getKey()); - if ( otherEntry == null ) { - continue; // no counterpart for this file span in other - } - GATKBAMFileSpan otherFileSpan = new GATKBAMFileSpan(otherEntry); - - if ( thisFileSpan.getExtent().overlaps(otherFileSpan.getExtent()) ) { - return true; - } - } - - return false; - } - - @Override - public String toString() { - StringBuilder builder = new StringBuilder(); - builder.append("FilePointer:\n"); - builder.append("\tlocations = {"); - builder.append(Utils.join(";",locations)); - builder.append("}\n\tregions = \n"); - for(Map.Entry entry: fileSpans.entrySet()) { - builder.append(entry.getKey()); - builder.append("= {"); - builder.append(entry.getValue()); - builder.append("}"); - } - return builder.toString(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java deleted file mode 100644 index 17afd5894..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java +++ /dev/null @@ -1,468 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.Bin; -import htsjdk.samtools.GATKBin; -import htsjdk.samtools.GATKChunk; -import htsjdk.samtools.LinearIndex; -import htsjdk.samtools.seekablestream.SeekableBufferedStream; -import htsjdk.samtools.seekablestream.SeekableFileStream; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.File; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -/** - * A basic interface for querying BAM indices. - * Very much not thread-safe. - * - * @author mhanna - * @version 0.1 - */ -public class GATKBAMIndex { - /** - * BAM index file magic number. - */ - private static final byte[] BAM_INDEX_MAGIC = "BAI\1".getBytes(); - - /** - * Reports the total amount of genomic data that any bin can index. - */ - protected static final int BIN_GENOMIC_SPAN = 512*1024*1024; - - /** - * What is the starting bin for each level? - */ - private static final int[] LEVEL_STARTS = {0,1,9,73,585,4681}; - - /** - * Reports the maximum number of bins that can appear in a BAM file. - */ - public static final int MAX_BINS = 37450; // =(8^6-1)/7+1 - - private final File mFile; - - //TODO: figure out a good value for this buffer size - private final int BUFFERED_STREAM_BUFFER_SIZE = 8192; - - /** - * Number of sequences stored in this index. - */ - private final int sequenceCount; - - /** - * A cache of the starting positions of the sequences. - */ - private final long[] sequenceStartCache; - - private SeekableFileStream fileStream; - private SeekableBufferedStream bufferedStream; - private long fileLength; - - public GATKBAMIndex(final File file) { - mFile = file; - // Open the file stream. - openIndexFile(); - - // Verify the magic number. - seek(0); - final byte[] buffer = readBytes(4); - if (!Arrays.equals(buffer, BAM_INDEX_MAGIC)) { - throw new ReviewedGATKException("Invalid file header in BAM index " + mFile + - ": " + new String(buffer)); - } - - seek(4); - - sequenceCount = readInteger(); - - // Create a cache of the starting position of each sequence. Initialize it to -1. - sequenceStartCache = new long[sequenceCount]; - for(int i = 1; i < sequenceCount; i++) - sequenceStartCache[i] = -1; - - // Seed the first element in the array with the current position. - if(sequenceCount > 0) - sequenceStartCache[0] = position(); - - closeIndexFile(); - } - - public GATKBAMIndexData readReferenceSequence(final int referenceSequence) { - openIndexFile(); - - if (referenceSequence >= sequenceCount) - throw new ReviewedGATKException("Invalid sequence number " + referenceSequence + " in index file " + mFile); - - skipToSequence(referenceSequence); - - int binCount = readInteger(); - List bins = new ArrayList(); - for (int binNumber = 0; binNumber < binCount; binNumber++) { - final int indexBin = readInteger(); - final int nChunks = readInteger(); - - List chunks = new ArrayList(nChunks); - long[] rawChunkData = readLongs(nChunks*2); - for (int ci = 0; ci < nChunks; ci++) { - final long chunkBegin = rawChunkData[ci*2]; - final long chunkEnd = rawChunkData[ci*2+1]; - chunks.add(new GATKChunk(chunkBegin, chunkEnd)); - } - GATKBin bin = new GATKBin(referenceSequence, indexBin); - bin.setChunkList(chunks.toArray(new GATKChunk[chunks.size()])); - while(indexBin >= bins.size()) - bins.add(null); - bins.set(indexBin,bin); - } - - final int nLinearBins = readInteger(); - long[] linearIndexEntries = readLongs(nLinearBins); - - LinearIndex linearIndex = new LinearIndex(referenceSequence,0,linearIndexEntries); - - closeIndexFile(); - - return new GATKBAMIndexData(this,referenceSequence,bins,linearIndex); - } - - /** - * Get the number of levels employed by this index. - * @return Number of levels in this index. - */ - public static int getNumIndexLevels() { - return LEVEL_STARTS.length; - } - - /** - * Gets the first bin in the given level. - * @param levelNumber Level number. 0-based. - * @return The first bin in this level. - */ - public static int getFirstBinInLevel(final int levelNumber) { - return LEVEL_STARTS[levelNumber]; - } - - /** - * Gets the number of bins in the given level. - * @param levelNumber Level number. 0-based. - * @return The size (number of possible bins) of the given level. - */ - public int getLevelSize(final int levelNumber) { - if(levelNumber == getNumIndexLevels()-1) - return MAX_BINS-LEVEL_STARTS[levelNumber]-1; - else - return LEVEL_STARTS[levelNumber+1]-LEVEL_STARTS[levelNumber]; - } - - /** - * Gets the level associated with the given bin number. - * @param bin The bin for which to determine the level. - * @return the level associated with the given bin number. - */ - public int getLevelForBin(final Bin bin) { - GATKBin gatkBin = new GATKBin(bin); - if(gatkBin.getBinNumber() >= MAX_BINS) - throw new ReviewedGATKException("Tried to get level for invalid bin in index file " + mFile); - for(int i = getNumIndexLevels()-1; i >= 0; i--) { - if(gatkBin.getBinNumber() >= LEVEL_STARTS[i]) - return i; - } - throw new ReviewedGATKException("Unable to find correct bin for bin " + bin + " in index file " + mFile); - } - - /** - * Gets the first locus that this bin can index into. - * @param bin The bin to test. - * @return The last position that the given bin can represent. - */ - public int getFirstLocusInBin(final Bin bin) { - final int level = getLevelForBin(bin); - final int levelStart = LEVEL_STARTS[level]; - final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart; - return (new GATKBin(bin).getBinNumber() - levelStart)*(BIN_GENOMIC_SPAN /levelSize)+1; - } - - /** - * Gets the last locus that this bin can index into. - * @param bin The bin to test. - * @return The last position that the given bin can represent. - */ - public int getLastLocusInBin(final Bin bin) { - final int level = getLevelForBin(bin); - final int levelStart = LEVEL_STARTS[level]; - final int levelSize = ((level==getNumIndexLevels()-1) ? MAX_BINS-1 : LEVEL_STARTS[level+1]) - levelStart; - return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize); - } - - /** - * Use to get close to the unmapped reads at the end of a BAM file. - * @return The file offset of the first record in the last linear bin, or -1 - * if there are no elements in linear bins (i.e. no mapped reads). - */ - public long getStartOfLastLinearBin() { - openIndexFile(); - - seek(4); - - final int sequenceCount = readInteger(); - // Because no reads may align to the last sequence in the sequence dictionary, - // grab the last element of the linear index for each sequence, and return - // the last one from the last sequence that has one. - long lastLinearIndexPointer = -1; - for (int i = 0; i < sequenceCount; i++) { - // System.out.println("# Sequence TID: " + i); - final int nBins = readInteger(); - // System.out.println("# nBins: " + nBins); - for (int j1 = 0; j1 < nBins; j1++) { - // Skip bin # - skipBytes(4); - final int nChunks = readInteger(); - // Skip chunks - skipBytes(16 * nChunks); - } - final int nLinearBins = readInteger(); - if (nLinearBins > 0) { - // Skip to last element of list of linear bins - skipBytes(8 * (nLinearBins - 1)); - lastLinearIndexPointer = readLongs(1)[0]; - } - } - - closeIndexFile(); - - return lastLinearIndexPointer; - } - - /** - * Gets the possible number of bins for a given reference sequence. - * @return How many bins could possibly be used according to this indexing scheme to index a single contig. - */ - protected int getMaxAddressibleGenomicLocation() { - return BIN_GENOMIC_SPAN; - } - - protected void skipToSequence(final int referenceSequence) { - // Find the offset in the file of the last sequence whose position has been determined. Start here - // when searching the sequence for the next value to read. (Note that sequenceStartCache[0] will always - // be present, so no extra stopping condition is necessary. - int sequenceIndex = referenceSequence; - while(sequenceStartCache[sequenceIndex] == -1) - sequenceIndex--; - - // Advance to the most recently found position. - seek(sequenceStartCache[sequenceIndex]); - - for (int i = sequenceIndex; i < referenceSequence; i++) { - sequenceStartCache[i] = position(); - // System.out.println("# Sequence TID: " + i); - final int nBins = readInteger(); - // System.out.println("# nBins: " + nBins); - for (int j = 0; j < nBins; j++) { - final int bin = readInteger(); - final int nChunks = readInteger(); - // System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks); - skipBytes(16 * nChunks); - } - final int nLinearBins = readInteger(); - // System.out.println("# nLinearBins: " + nLinearBins); - skipBytes(8 * nLinearBins); - - } - - sequenceStartCache[referenceSequence] = position(); - } - - - - private void openIndexFile() { - try { - fileStream = new SeekableFileStream(mFile); - bufferedStream = new SeekableBufferedStream(fileStream,BUFFERED_STREAM_BUFFER_SIZE); - fileLength=bufferedStream.length(); - } - catch (IOException exc) { - throw new ReviewedGATKException("Unable to open index file (" + exc.getMessage() +")" + mFile, exc); - } - } - - private void closeIndexFile() { - try { - bufferedStream.close(); - fileStream.close(); - fileLength = -1; - } - catch (IOException exc) { - throw new ReviewedGATKException("Unable to close index file " + mFile, exc); - } - } - - private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8; - private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8; - - private byte[] readBytes(int count) { - ByteBuffer buffer = getBuffer(count); - read(buffer); - buffer.flip(); - byte[] contents = new byte[count]; - buffer.get(contents); - return contents; - } - - private int readInteger() { - ByteBuffer buffer = getBuffer(INT_SIZE_IN_BYTES); - read(buffer); - buffer.flip(); - return buffer.getInt(); - } - - /** - * Reads an array of longs from the file channel, returning the results as an array. - * @param count Number of longs to read. - * @return An array of longs. Size of array should match count. - */ - private long[] readLongs(final int count) { - ByteBuffer buffer = getBuffer(count*LONG_SIZE_IN_BYTES); - read(buffer); - buffer.flip(); - long[] result = new long[count]; - for(int i = 0; i < count; i++) - result[i] = buffer.getLong(); - return result; - } - - private void read(final ByteBuffer buffer) { - final int bytesRequested = buffer.limit(); - - try { - - //BufferedInputStream cannot read directly into a byte buffer, so we read into an array - //and put the result into the bytebuffer after the if statement. - - // We have a rigid expectation here to read in exactly the number of bytes we've limited - // our buffer to -- if there isn't enough data in the file, the index - // must be truncated or otherwise corrupt: - if(bytesRequested > fileLength - bufferedStream.position()){ - throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " + - "It's likely that this file is truncated or corrupt -- " + - "Please try re-indexing the corresponding BAM file.", - mFile)); - } - - int totalBytesRead = 0; - // This while loop must terminate since we demand that we read at least one byte from the file at each iteration - while (totalBytesRead < bytesRequested) { - // bufferedStream.read may return less than the requested amount of byte despite - // not reaching the end of the file, hence the loop. - int bytesRead = bufferedStream.read(byteArray, totalBytesRead, bytesRequested-totalBytesRead); - - // We have a rigid expectation here to read in exactly the number of bytes we've limited - // our buffer to -- if we encounter EOF (-1), the index - // must be truncated or otherwise corrupt: - if (bytesRead <= 0) { - throw new UserException.MalformedFile(mFile, String.format("Premature end-of-file while reading BAM index file %s. " + - "It's likely that this file is truncated or corrupt -- " + - "Please try re-indexing the corresponding BAM file.", - mFile)); - } - totalBytesRead += bytesRead; - } - if(totalBytesRead != bytesRequested) - throw new RuntimeException("Read amount different from requested amount. This should not happen."); - - buffer.put(byteArray, 0, bytesRequested); - } - catch(IOException ex) { - throw new ReviewedGATKException("Index: unable to read bytes from index file " + mFile); - } - } - - - /** - * A reusable buffer for use by this index generator. - * TODO: Should this be a SoftReference? - */ - private ByteBuffer buffer = null; - - //BufferedStream don't read into ByteBuffers, so we need this temporary array - private byte[] byteArray=null; - private ByteBuffer getBuffer(final int size) { - if(buffer == null || buffer.capacity() < size) { - // Allocate a new byte buffer. For now, make it indirect to make sure it winds up on the heap for easier debugging. - buffer = ByteBuffer.allocate(size); - byteArray = new byte[size]; - buffer.order(ByteOrder.LITTLE_ENDIAN); - } - buffer.clear(); - buffer.limit(size); - return buffer; - } - - private void skipBytes(final int count) { - try { - - //try to skip forward the requested amount. - long skipped = bufferedStream.skip(count); - - if( skipped != count ) { //if not managed to skip the requested amount - throw new ReviewedGATKException("Index: unable to reposition file channel of index file " + mFile); - } - } - catch(IOException ex) { - throw new ReviewedGATKException("Index: unable to reposition file channel of index file " + mFile); - } - } - - private void seek(final long position) { - try { - //to seek a new position, move the fileChannel, and reposition the bufferedStream - bufferedStream.seek(position); - } - catch(IOException ex) { - throw new ReviewedGATKException("Index: unable to reposition of file channel of index file " + mFile); - } - } - - /** - * Retrieve the position from the current file channel. - * @return position of the current file channel. - */ - private long position() { - try { - return bufferedStream.position(); - } - catch (IOException exc) { - throw new ReviewedGATKException("Unable to read position from index file " + mFile, exc); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShard.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShard.java deleted file mode 100644 index 28d4faf2c..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShard.java +++ /dev/null @@ -1,60 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.SAMFileSpan; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.Utils; - -import java.util.List; -import java.util.Map; - -/** - * Handles locus shards of BAM information. - * @author aaron - * @version 1.0 - * @date Apr 7, 2009 - */ -public class LocusShard extends Shard { - /** - * Create a new locus shard, divided by index. - * @param intervals List of intervals to process. - * @param fileSpans File spans associated with that interval. - */ - public LocusShard(GenomeLocParser parser, SAMDataSource dataSource, List intervals, Map fileSpans) { - super(parser, ShardType.LOCUS, intervals, dataSource, fileSpans, false); - } - - /** - * String representation of this shard. - * @return A string representation of the boundaries of this shard. - */ - @Override - public String toString() { - return Utils.join(";",getGenomeLocs()); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShard.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShard.java deleted file mode 100644 index d4321da3b..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShard.java +++ /dev/null @@ -1,271 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.util.PeekableIterator; -import htsjdk.samtools.*; -import htsjdk.samtools.util.CloseableIterator; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIteratorAdapter; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.*; - -/** - * - * User: aaron - * Date: Apr 10, 2009 - * Time: 5:03:13 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - -/** - * Expresses a shard of read data in block format. - * - * @author mhanna - * @version 0.1 - */ -public class ReadShard extends Shard { - - /** - * Default read shard buffer size - */ - public static final int DEFAULT_MAX_READS = 10000; - - /** - * What is the maximum number of reads per BAM file which should go into a read shard. - * - * TODO: this non-final static variable should either be made final or turned into an - * TODO: instance variable somewhere -- as both static and mutable it wreaks havoc - * TODO: with tests that use multiple instances of SAMDataSource (since SAMDataSource - * TODO: changes this value) - */ - public static int MAX_READS = DEFAULT_MAX_READS; - - /** - * The reads making up this shard. - */ - private final Collection reads = new ArrayList(MAX_READS); - - public ReadShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map fileSpans, List loci, boolean isUnmapped) { - super(parser, ShardType.READ, loci, readsDataSource, fileSpans, isUnmapped); - } - - /** - * Sets the maximum number of reads buffered in a read shard. Implemented as a weirdly static interface - * until we know what effect tuning this parameter has. - * - * TODO: this mutable static interface is awful and breaks tests -- need to refactor - * - * @param bufferSize New maximum number - */ - static void setReadBufferSize(final int bufferSize) { - MAX_READS = bufferSize; - } - - /** - * What read buffer size are we using? - * - * @return - */ - public static int getReadBufferSize() { - return MAX_READS; - } - - /** - * Returns true if this shard is meant to buffer reads, rather - * than just holding pointers to their locations. - * @return True if this shard can buffer reads. False otherwise. - */ - public boolean buffersReads() { - return true; - } - - /** - * Returns true if the read buffer is currently full. - * @return True if this shard's buffer is full (and the shard can buffer reads). - */ - public boolean isBufferEmpty() { - return reads.size() == 0; - } - - /** - * Returns true if the read buffer is currently full. - * @return True if this shard's buffer is full (and the shard can buffer reads). - */ - public boolean isBufferFull() { - return reads.size() > ReadShard.MAX_READS; - } - - /** - * Adds a read to the read buffer. - * @param read Add a read to the internal shard buffer. - */ - public void addRead(SAMRecord read) { - // DO NOT validate that the buffer is full. Paired read sharding will occasionally have to stuff another - // read or two into the buffer. - reads.add(read); - } - - /** - * Fills this shard's buffer with reads from the iterator passed in - * - * @param readIter Iterator from which to draw the reads to fill the shard - */ - @Override - public void fill( PeekableIterator readIter ) { - if( ! buffersReads() ) - throw new ReviewedGATKException("Attempting to fill a non-buffering shard."); - - SAMFileHeader.SortOrder sortOrder = getReadProperties().getSortOrder(); - SAMRecord read = null; - - while( ! isBufferFull() && readIter.hasNext() ) { - final SAMRecord nextRead = readIter.peek(); - if ( read == null || (nextRead.getReferenceIndex().equals(read.getReferenceIndex())) ) { - // only add reads to the shard if they are on the same contig - read = readIter.next(); - addRead(read); - } else { - break; - } - } - - // If the reads are sorted in coordinate order, ensure that all reads - // having the same alignment start become part of the same shard, to allow - // downsampling to work better across shard boundaries. Note that because our - // read stream has already been fed through the positional downsampler, which - // ensures that at each alignment start position there are no more than dcov - // reads, we're in no danger of accidentally creating a disproportionately huge - // shard - if ( sortOrder == SAMFileHeader.SortOrder.coordinate ) { - while ( readIter.hasNext() ) { - SAMRecord additionalRead = readIter.peek(); - - // Stop filling the shard as soon as we encounter a read having a different - // alignment start or contig from the last read added in the earlier loop - // above, or an unmapped read - if ( read == null || - additionalRead.getReadUnmappedFlag() || - ! additionalRead.getReferenceIndex().equals(read.getReferenceIndex()) || - additionalRead.getAlignmentStart() != read.getAlignmentStart() ) { - break; - } - - addRead(readIter.next()); - } - } - - // If the reads are sorted in queryname order, ensure that all reads - // having the same queryname become part of the same shard. - if( sortOrder == SAMFileHeader.SortOrder.queryname ) { - while( readIter.hasNext() ) { - SAMRecord nextRead = readIter.peek(); - if( read == null || ! read.getReadName().equals(nextRead.getReadName()) ) - break; - addRead(readIter.next()); - } - } - } - - /** - * Creates an iterator over reads stored in this shard's read cache. - * @return - */ - public GATKSAMIterator iterator() { - return GATKSAMIteratorAdapter.adapt(reads.iterator()); - } - - /** - * String representation of this shard. - * @return A string representation of the boundaries of this shard. - */ - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - for(Map.Entry entry: getFileSpans().entrySet()) { - sb.append(entry.getKey()); - sb.append(": "); - sb.append(entry.getValue()); - sb.append(' '); - } - return sb.toString(); - } - - /** - * Get the full span from the start of the left most read to the end of the right most one - * - * Note this may be different than the getLocation() of the shard, as this reflects the - * targeted span, not the actual span of reads - * - * @return the genome loc representing the span of these reads on the genome - */ - public GenomeLoc getReadsSpan() { - if ( isUnmapped() || super.getGenomeLocs() == null || reads.isEmpty() ) - return super.getLocation(); - else { - int start = Integer.MAX_VALUE; - int stop = Integer.MIN_VALUE; - String contig = null; - boolean foundMapped = false; - - for ( final SAMRecord read : reads ) { - if ( contig != null && ! read.getReferenceName().equals(contig) ) - throw new ReviewedGATKException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. " - + "First contig is " + contig + " next read was " + read.getReferenceName() ); - contig = read.getReferenceName(); - - // Even if this shard as a *whole* is not "unmapped", we can still encounter *individual* unmapped mates - // of mapped reads within this shard's buffer. In fact, if we're very unlucky with shard boundaries, - // this shard might consist *only* of unmapped mates! We need to refrain from using the alignment - // starts/stops of these unmapped mates, and detect the case where the shard has been filled *only* - // with unmapped mates. - if ( ! read.getReadUnmappedFlag() ) { - foundMapped = true; - if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart(); - if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd(); - } - } - - assert contig != null; - - if ( ! foundMapped || contig.equals("*") ) // all reads are unmapped - return GenomeLoc.UNMAPPED; - else - return parser.createGenomeLoc(contig, start, stop); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java deleted file mode 100644 index 0fc06fcce..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java +++ /dev/null @@ -1,1179 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.MergingSamRecordIterator; -import htsjdk.samtools.SamFileHeaderMerger; -import htsjdk.samtools.*; -import htsjdk.samtools.util.CloseableIterator; -import htsjdk.samtools.util.RuntimeIOException; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.ReadMetrics; -import org.broadinstitute.gatk.engine.ReadProperties; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.engine.downsampling.*; -import org.broadinstitute.gatk.engine.filters.CountingFilteringIterator; -import org.broadinstitute.gatk.engine.filters.ReadFilter; -import org.broadinstitute.gatk.engine.iterators.*; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.GenomeLocSortedSet; -import org.broadinstitute.gatk.utils.SimpleTimer; -import org.broadinstitute.gatk.utils.baq.ReadTransformingIterator; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; -import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.gatk.utils.sam.GATKSamRecordFactory; - -import java.io.File; -import java.lang.reflect.InvocationTargetException; -import java.lang.reflect.Method; -import java.util.*; -import java.util.concurrent.Callable; - -/** - * User: aaron - * Date: Mar 26, 2009 - * Time: 2:36:16 PM - *

    - * Converts shards to SAM iterators over the specified region - */ -public class SAMDataSource { - final private static GATKSamRecordFactory factory = new GATKSamRecordFactory(); - - /** Backing support for reads. */ - protected final ReadProperties readProperties; - - /** - * Runtime metrics of reads filtered, etc. - */ - private final ReadMetrics readMetrics; - - /** - * Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering. - */ - protected final GenomeLocParser genomeLocParser; - - /** - * Identifiers for the readers driving this data source. - */ - private final Collection readerIDs; - - /** - * How strict are the readers driving this data source. - */ - private final ValidationStringency validationStringency; - - /** - * Do we want to remove the program records from this data source? - */ - private final boolean removeProgramRecords; - - /** - * Store BAM indices for each reader present. - */ - private final Map bamIndices = new HashMap(); - - /** - * The merged header. - */ - private final SAMFileHeader mergedHeader; - - /** - * The constituent headers of the unmerged files. - */ - private final Map headers = new HashMap(); - - /** - * The sort order of the BAM files. Files without a sort order tag are assumed to be - * in coordinate order. - */ - private SAMFileHeader.SortOrder sortOrder = null; - - /** - * Whether the read groups in overlapping files collide. - */ - private final boolean hasReadGroupCollisions; - - /** - * Maps the SAM readers' merged read group ids to their original ids. Since merged read group ids - * are always unique, we can simply use a map here, no need to stratify by reader. - */ - private final ReadGroupMapping mergedToOriginalReadGroupMappings = new ReadGroupMapping(); - - /** - * Maps the SAM readers' original read group ids to their revised ids. This mapping must be stratified - * by readers, since there can be readgroup id collision: different bam files (readers) can list the - * same read group id, which will be disambiguated when these input streams are merged. - */ - private final Map originalToMergedReadGroupMappings = new HashMap(); - - /** - * Mapping from input file path to new sample name. Used only when doing on-the-fly sample renaming. - */ - private Map sampleRenameMap = null; - - /** our log, which we want to capture anything from this class */ - private static Logger logger = Logger.getLogger(SAMDataSource.class); - - /** - * A collection of readers driving the merging process. - */ - private final SAMResourcePool resourcePool; - - /** - * Asynchronously loads BGZF blocks. - */ - private final BGZFBlockLoadingDispatcher dispatcher; - - /** - * How are threads allocated. - */ - private final ThreadAllocation threadAllocation; - - /** - * How are adjacent intervals merged by the sharder? - */ - private final IntervalMergingRule intervalMergingRule; - - /** - * Static set of unsupported programs that create bam files. - * The key is the PG record ID and the value is the name of the tool that created it - */ - private static Map unsupportedPGs = new HashMap<>(); - static { - unsupportedPGs.put("GATK ReduceReads", "ReduceReads"); - } - - /** - * Create a new SAM data source given the supplied read metadata. - * - * For testing purposes - * - * @param samFiles list of reads files. - */ - public SAMDataSource(Collection samFiles, ThreadAllocation threadAllocation, Integer numFileHandles, GenomeLocParser genomeLocParser) { - this( - samFiles, - threadAllocation, - numFileHandles, - genomeLocParser, - false, - ValidationStringency.STRICT, - null, - null, - new ValidationExclusion(), - new ArrayList(), - false); - } - - /** - * See complete constructor. Does not enable BAQ by default. - * - * For testing purposes - */ - public SAMDataSource( - Collection samFiles, - ThreadAllocation threadAllocation, - Integer numFileHandles, - GenomeLocParser genomeLocParser, - boolean useOriginalBaseQualities, - ValidationStringency strictness, - Integer readBufferSize, - DownsamplingMethod downsamplingMethod, - ValidationExclusion exclusionList, - Collection supplementalFilters, - boolean includeReadsWithDeletionAtLoci) { - this( samFiles, - threadAllocation, - numFileHandles, - genomeLocParser, - useOriginalBaseQualities, - strictness, - readBufferSize, - downsamplingMethod, - exclusionList, - supplementalFilters, - Collections.emptyList(), - includeReadsWithDeletionAtLoci, - (byte) -1, - false, - false, - null, - IntervalMergingRule.ALL); - } - - /** - * Create a new SAM data source given the supplied read metadata. - * @param samFiles list of reads files. - * @param useOriginalBaseQualities True if original base qualities should be used. - * @param strictness Stringency of reads file parsing. - * @param readBufferSize Number of reads to hold in memory per BAM. - * @param downsamplingMethod Method for downsampling reads at a given locus. - * @param exclusionList what safety checks we're willing to let slide - * @param supplementalFilters additional filters to dynamically apply. - * @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method - * will explicitly list reads with deletion over the current reference base; otherwise, only observed - * bases will be seen in the pileups, and the deletions will be skipped silently. - * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. - * @param keepReadsInLIBS should we keep a unique list of reads in LIBS? - * @param sampleRenameMap Map of BAM file to new sample ID used during on-the-fly runtime sample renaming. - * Will be null if we're not doing sample renaming. - * @param intervalMergingRule how are adjacent intervals merged by the sharder - */ - public SAMDataSource( - Collection samFiles, - ThreadAllocation threadAllocation, - Integer numFileHandles, - GenomeLocParser genomeLocParser, - boolean useOriginalBaseQualities, - ValidationStringency strictness, - Integer readBufferSize, - DownsamplingMethod downsamplingMethod, - ValidationExclusion exclusionList, - Collection supplementalFilters, - List readTransformers, - boolean includeReadsWithDeletionAtLoci, - byte defaultBaseQualities, - boolean removeProgramRecords, - final boolean keepReadsInLIBS, - final Map sampleRenameMap, - final IntervalMergingRule intervalMergingRule) { - - this.readMetrics = new ReadMetrics(); - this.genomeLocParser = genomeLocParser; - this.intervalMergingRule = intervalMergingRule; - - readerIDs = samFiles; - - this.threadAllocation = threadAllocation; - // TODO: Consider a borrowed-thread dispatcher implementation. - if(this.threadAllocation.getNumIOThreads() > 0) { - logger.info("Running in asynchronous I/O mode; number of threads = " + this.threadAllocation.getNumIOThreads()); - dispatcher = new BGZFBlockLoadingDispatcher(this.threadAllocation.getNumIOThreads(), numFileHandles != null ? numFileHandles : 1); - } - else - dispatcher = null; - - validationStringency = strictness; - this.removeProgramRecords = removeProgramRecords; - if(readBufferSize != null) - ReadShard.setReadBufferSize(readBufferSize); // TODO: use of non-final static variable here is just awful, especially for parallel tests - else { - // Choose a sensible default for the read buffer size. - // Previously we we're picked 100000 reads per BAM per shard with a max cap of 250K reads in memory at once. - // Now we are simply setting it to 100K reads - ReadShard.setReadBufferSize(100000); - } - - this.sampleRenameMap = sampleRenameMap; - - resourcePool = new SAMResourcePool(Integer.MAX_VALUE); - SAMReaders readers = resourcePool.getAvailableReaders(); - - // Determine the sort order. - for(SAMReaderID readerID: readerIDs) { - if (! readerID.samFile.canRead() ) - throw new UserException.CouldNotReadInputFile(readerID.samFile,"file is not present or user does not have appropriate permissions. " + - "Please check that the file is present and readable and try again."); - - // Get the sort order, forcing it to coordinate if unsorted. - SAMFileReader reader = readers.getReader(readerID); - SAMFileHeader header = reader.getFileHeader(); - - headers.put(readerID,header); - - if ( header.getReadGroups().isEmpty() ) { - throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile, - "SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups"); - } - - SAMFileHeader.SortOrder sortOrder = header.getSortOrder() != SAMFileHeader.SortOrder.unsorted ? header.getSortOrder() : SAMFileHeader.SortOrder.coordinate; - - // Validate that all input files are sorted in the same order. - if(this.sortOrder != null && this.sortOrder != sortOrder) - throw new UserException.MissortedBAM(String.format("Attempted to process mixed of files sorted as %s and %s.",this.sortOrder,sortOrder)); - - // Update the sort order. - this.sortOrder = sortOrder; - } - - mergedHeader = readers.getMergedHeader(); - hasReadGroupCollisions = readers.hasReadGroupCollisions(); - - readProperties = new ReadProperties( - samFiles, - mergedHeader, - sortOrder, - useOriginalBaseQualities, - strictness, - downsamplingMethod, - exclusionList, - supplementalFilters, - readTransformers, - includeReadsWithDeletionAtLoci, - defaultBaseQualities, - keepReadsInLIBS); - - // cache the read group id (original) -> read group id (merged) - // and read group id (merged) -> read group id (original) mappings. - for(SAMReaderID id: readerIDs) { - SAMFileReader reader = readers.getReader(id); - - ReadGroupMapping mappingToMerged = new ReadGroupMapping(); - - List readGroups = reader.getFileHeader().getReadGroups(); - for(SAMReadGroupRecord readGroup: readGroups) { - if(hasReadGroupCollisions) { - mappingToMerged.put(readGroup.getReadGroupId(),readers.getReadGroupId(id,readGroup.getReadGroupId())); - mergedToOriginalReadGroupMappings.put(readers.getReadGroupId(id,readGroup.getReadGroupId()),readGroup.getReadGroupId()); - } else { - mappingToMerged.put(readGroup.getReadGroupId(),readGroup.getReadGroupId()); - mergedToOriginalReadGroupMappings.put(readGroup.getReadGroupId(),readGroup.getReadGroupId()); - } - } - - originalToMergedReadGroupMappings.put(id,mappingToMerged); - } - - for(SAMReaderID id: readerIDs) { - File indexFile = findIndexFile(id.samFile); - if(indexFile != null) - bamIndices.put(id,new GATKBAMIndex(indexFile)); - } - - resourcePool.releaseReaders(readers); - } - - /** - * Checks whether the provided SAM header if from a reduced bam file. - * @param header the SAM header for a given file - * @throws UserException if the header is from a reduced bam - */ - private void checkForUnsupportedBamFile(final SAMFileHeader header) { - for ( final SAMProgramRecord PGrecord : header.getProgramRecords() ) { - if ( unsupportedPGs.containsKey(PGrecord.getId()) ) - throw new UserException("The GATK no longer supports running off of BAMs produced by " + unsupportedPGs.get(PGrecord.getId())); - } - } - - public void close() { - SAMReaders readers = resourcePool.getAvailableReaders(); - for(SAMReaderID readerID: readerIDs) { - SAMFileReader reader = readers.getReader(readerID); - reader.close(); - } - } - - /** - * Returns Reads data structure containing information about the reads data sources placed in this pool as well as - * information about how they are downsampled, sorted, and filtered - * @return - */ - public ReadProperties getReadsInfo() { return readProperties; } - - /** - * Checks to see whether any reads files are supplying data. - * @return True if no reads files are supplying data to the traversal; false otherwise. - */ - public boolean isEmpty() { - return readProperties.getSAMReaderIDs().size() == 0; - } - - /** - * Gets the SAM file associated with a given reader ID. - * @param id The reader for which to retrieve the source file. - * @return the file actually associated with the id. - */ - public File getSAMFile(SAMReaderID id) { - return id.samFile; - } - - /** - * Returns readers used by this data source. - * @return A list of SAM reader IDs. - */ - public Collection getReaderIDs() { - return readerIDs; - } - - /** - * Retrieves the id of the reader which built the given read. - * @param read The read to test. - * @return ID of the reader. - */ - public SAMReaderID getReaderID(SAMRecord read) { - return resourcePool.getReaderID(read.getFileSource().getReader()); - } - - /** - * Gets the merged header from the SAM file. - * @return The merged header. - */ - public SAMFileHeader getHeader() { - return mergedHeader; - } - - public SAMFileHeader getHeader(SAMReaderID id) { - return headers.get(id); - } - - /** - * Gets the revised read group id mapped to this 'original' read group id. - * @param reader for which to grab a read group. - * @param originalReadGroupId ID of the original read group. - * @return Merged read group ID. - */ - public String getReadGroupId(final SAMReaderID reader, final String originalReadGroupId) { - return originalToMergedReadGroupMappings.get(reader).get(originalReadGroupId); - } - - /** - * Gets the original read group id (as it was specified in the original input bam file) that maps onto - * this 'merged' read group id. - * @param mergedReadGroupId 'merged' ID of the read group (as it is presented by the read received from merged input stream). - * @return Merged read group ID. - */ - public String getOriginalReadGroupId(final String mergedReadGroupId) { - return mergedToOriginalReadGroupMappings.get(mergedReadGroupId); - } - - /** - * True if all readers have an index. - * @return True if all readers have an index. - */ - public boolean hasIndex() { - return readerIDs.size() == bamIndices.size(); - } - - /** - * Gets the index for a particular reader. Always preloaded. - * @param id Id of the reader. - * @return The index. Will preload the index if necessary. - */ - public GATKBAMIndex getIndex(final SAMReaderID id) { - return bamIndices.get(id); - } - - /** - * Retrieves the sort order of the readers. - * @return Sort order. Can be unsorted, coordinate order, or query name order. - */ - public SAMFileHeader.SortOrder getSortOrder() { - return sortOrder; - } - - /** - * Gets the cumulative read metrics for shards already processed. - * @return Cumulative read metrics. - */ - public ReadMetrics getCumulativeReadMetrics() { - // don't return a clone here because the engine uses a pointer to this object - return readMetrics; - } - - /** - * Incorporate the given read metrics into the cumulative read metrics. - * @param readMetrics The 'incremental' read metrics, to be incorporated into the cumulative metrics. - */ - public void incorporateReadMetrics(final ReadMetrics readMetrics) { - this.readMetrics.incrementMetrics(readMetrics); - } - - public GATKSAMIterator seek(Shard shard) { - if(shard.buffersReads()) { - return shard.iterator(); - } - else { - return getIterator(shard); - } - } - - /** - * Gets the reader associated with the given read. - * @param readers Available readers. - * @param read - * @return - */ - private SAMReaderID getReaderID(SAMReaders readers, SAMRecord read) { - for(SAMReaderID id: getReaderIDs()) { - if(readers.getReader(id) == read.getFileSource().getReader()) - return id; - } - throw new ReviewedGATKException("Unable to find id for reader associated with read " + read.getReadName()); - } - - /** - * Get the initial reader positions across all BAM files - * - * @return the start positions of the first chunk of reads for all BAM files - */ - protected Map getInitialReaderPositions() { - Map initialPositions = new HashMap(); - SAMReaders readers = resourcePool.getAvailableReaders(); - - for ( SAMReaderID id: getReaderIDs() ) { - initialPositions.put(id, new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads())); - } - - resourcePool.releaseReaders(readers); - return initialPositions; - } - - /** - * Get an iterator over the data types specified in the shard. - * - * @param shard The shard specifying the data limits. - * @return An iterator over the selected data. - */ - protected GATKSAMIterator getIterator( Shard shard ) { - return getIterator(resourcePool.getAvailableReaders(), shard, shard instanceof ReadShard); - } - - /** - * Get an iterator over the data types specified in the shard. - * @param readers Readers from which to load data. - * @param shard The shard specifying the data limits. - * @param enableVerification True to verify. For compatibility with old sharding strategy. - * @return An iterator over the selected data. - */ - private GATKSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) { - // Set up merging to dynamically merge together multiple BAMs. - Map> iteratorMap = new HashMap>(); - - for(SAMReaderID id: getReaderIDs()) { - CloseableIterator iterator = null; - - // TODO: null used to be the signal for unmapped, but we've replaced that with a simple index query for the last bin. - // TODO: Kill this check once we've proven that the design elements are gone. - if(shard.getFileSpans().get(id) == null) - throw new ReviewedGATKException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported."); - - try { - if(threadAllocation.getNumIOThreads() > 0) { - BlockInputStream inputStream = readers.getInputStream(id); - inputStream.submitAccessPlan(new BAMAccessPlan(id, inputStream, (GATKBAMFileSpan) shard.getFileSpans().get(id))); - BAMRecordCodec codec = new BAMRecordCodec(getHeader(id),factory); - codec.setInputStream(inputStream); - iterator = new BAMCodecIterator(inputStream,readers.getReader(id),codec); - } - else { - iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); - } - } catch ( RuntimeException e ) { // we need to catch RuntimeExceptions here because the Picard code is throwing them (among SAMFormatExceptions) sometimes - throw new UserException.MalformedBAM(id.samFile, e.getMessage()); - } - - iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator); - if(shard.getGenomeLocs().size() > 0) - iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); - - iteratorMap.put(readers.getReader(id), iterator); - } - - MergingSamRecordIterator mergingIterator = readers.createMergingIterator(iteratorMap); - - // The readMetrics object being passed in should be that of this dataSource and NOT the shard: the dataSource's - // metrics is intended to keep track of the reads seen (and hence passed to the CountingFilteringIterator when - // we apply the decorators), whereas the shard's metrics is used to keep track the "records" seen. - return applyDecoratingIterators(readMetrics, - enableVerification, - readProperties.useOriginalBaseQualities(), - new ReleasingIterator(readers,GATKSAMIteratorAdapter.adapt(mergingIterator)), - readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), - readProperties.getSupplementalFilters(), - readProperties.getReadTransformers(), - readProperties.defaultBaseQualities(), - shard instanceof LocusShard); - } - - private class BAMCodecIterator implements CloseableIterator { - private final BlockInputStream inputStream; - private final SAMFileReader reader; - private final BAMRecordCodec codec; - private SAMRecord nextRead; - - private BAMCodecIterator(final BlockInputStream inputStream, final SAMFileReader reader, final BAMRecordCodec codec) { - this.inputStream = inputStream; - this.reader = reader; - this.codec = codec; - advance(); - } - - public boolean hasNext() { - return nextRead != null; - } - - public SAMRecord next() { - if(!hasNext()) - throw new NoSuchElementException("Unable to retrieve next record from BAMCodecIterator; input stream is empty"); - SAMRecord currentRead = nextRead; - advance(); - return currentRead; - } - - public void close() { - // NO-OP. - } - - public void remove() { - throw new UnsupportedOperationException("Unable to remove from BAMCodecIterator"); - } - - private void advance() { - final long startCoordinate = inputStream.getFilePointer(); - nextRead = codec.decode(); - final long stopCoordinate = inputStream.getFilePointer(); - - if(reader != null && nextRead != null) - PicardNamespaceUtils.setFileSource(nextRead, new SAMFileSource(reader, new GATKBAMFileSpan(new GATKChunk(startCoordinate, stopCoordinate)))); - } - } - - /** - * Filter reads based on user-specified criteria. - * - * @param readMetrics metrics to track when using this iterator. - * @param enableVerification Verify the order of reads. - * @param useOriginalBaseQualities True if original base qualities should be used. - * @param wrappedIterator the raw data source. - * @param noValidationOfReadOrder Another trigger for the verifying iterator? TODO: look into this. - * @param supplementalFilters additional filters to apply to the reads. - * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. - * @param isLocusBasedTraversal true if we're dealing with a read stream from a LocusShard - * @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null. - */ - protected GATKSAMIterator applyDecoratingIterators(ReadMetrics readMetrics, - boolean enableVerification, - boolean useOriginalBaseQualities, - GATKSAMIterator wrappedIterator, - Boolean noValidationOfReadOrder, - Collection supplementalFilters, - List readTransformers, - byte defaultBaseQualities, - boolean isLocusBasedTraversal ) { - - // Always apply the ReadFormattingIterator before both ReadFilters and ReadTransformers. At a minimum, - // this will consolidate the cigar strings into canonical form. This has to be done before the read - // filtering, because not all read filters will behave correctly with things like zero-length cigar - // elements. If useOriginalBaseQualities is true or defaultBaseQualities >= 0, this iterator will also - // modify the base qualities. - wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities); - - // Read Filters: these are applied BEFORE downsampling, so that we downsample within the set of reads - // that actually survive filtering. Otherwise we could get much less coverage than requested. - wrappedIterator = GATKSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters)); - - // Downsampling: - - // For locus traversals where we're downsampling to coverage by sample, assume that the downsamplers - // will be invoked downstream from us in LocusIteratorByState. This improves performance by avoiding - // splitting/re-assembly of the read stream at this stage, and also allows for partial downsampling - // of individual reads. - boolean assumeDownstreamLIBSDownsampling = isLocusBasedTraversal && - readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && - readProperties.getDownsamplingMethod().toCoverage != null; - - // Apply downsampling iterators here only in cases where we know that LocusIteratorByState won't be - // doing any downsampling downstream of us - if ( ! assumeDownstreamLIBSDownsampling ) { - wrappedIterator = applyDownsamplingIterator(wrappedIterator); - } - - // unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification, - // verify the read ordering by applying a sort order iterator - if (!noValidationOfReadOrder && enableVerification) - wrappedIterator = new VerifyingSamIterator(wrappedIterator); - - // Read transformers: these are applied last, so that we don't bother transforming reads that get discarded - // by the read filters or downsampler. - for ( final ReadTransformer readTransformer : readTransformers ) { - if ( readTransformer.enabled() && readTransformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_INPUT ) - wrappedIterator = new ReadTransformingIterator(wrappedIterator, readTransformer); - } - - return wrappedIterator; - } - - protected GATKSAMIterator applyDownsamplingIterator( GATKSAMIterator wrappedIterator ) { - if ( readProperties.getDownsamplingMethod() == null || - readProperties.getDownsamplingMethod().type == DownsampleType.NONE ) { - return wrappedIterator; - } - - if ( readProperties.getDownsamplingMethod().toFraction != null ) { - - // If we're downsampling to a fraction of reads, there's no point in paying the cost of - // splitting/re-assembling the read stream by sample to run the FractionalDownsampler on - // reads from each sample separately, since the result would be the same as running the - // FractionalDownsampler on the entire stream. So, ALWAYS use the DownsamplingReadsIterator - // rather than the PerSampleDownsamplingReadsIterator, even if BY_SAMPLE downsampling - // was requested. - - return new DownsamplingReadsIterator(wrappedIterator, - new FractionalDownsampler(readProperties.getDownsamplingMethod().toFraction)); - } - else if ( readProperties.getDownsamplingMethod().toCoverage != null ) { - - // If we're downsampling to coverage, we DO need to pay the cost of splitting/re-assembling - // the read stream to run the downsampler on the reads for each individual sample separately if - // BY_SAMPLE downsampling was requested. - - if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) { - return new PerSampleDownsamplingReadsIterator(wrappedIterator, - new SimplePositionalDownsamplerFactory(readProperties.getDownsamplingMethod().toCoverage)); - } - else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) { - return new DownsamplingReadsIterator(wrappedIterator, - new SimplePositionalDownsampler(readProperties.getDownsamplingMethod().toCoverage)); - } - } - - return wrappedIterator; - } - - - private class SAMResourcePool { - /** - * How many entries can be cached in this resource pool? - */ - private final int maxEntries; - - /** - * All iterators of this reference-ordered data. - */ - private List allResources = new ArrayList(); - - /** - * All iterators that are not currently in service. - */ - private List availableResources = new ArrayList(); - - public SAMResourcePool(final int maxEntries) { - this.maxEntries = maxEntries; - } - - /** - * Choose a set of readers from the pool to use for this query. When complete, - * @return - */ - public synchronized SAMReaders getAvailableReaders() { - if(availableResources.size() == 0) - createNewResource(); - SAMReaders readers = availableResources.get(0); - availableResources.remove(readers); - return readers; - } - - public synchronized void releaseReaders(SAMReaders readers) { - if(!allResources.contains(readers)) - throw new ReviewedGATKException("Tried to return readers from the pool that didn't originate in the pool."); - availableResources.add(readers); - } - - /** - * Gets the reader id for the given reader. - * @param reader Reader for which to determine the id. - * @return id of the given reader. - */ - protected synchronized SAMReaderID getReaderID(SamReader reader) { - for(SAMReaders readers: allResources) { - SAMReaderID id = readers.getReaderID(reader); - if(id != null) - return id; - } - throw new ReviewedGATKException("No such reader id is available"); - } - - private synchronized void createNewResource() { - if(allResources.size() > maxEntries) - throw new ReviewedGATKException("Cannot create a new resource pool. All resources are in use."); - SAMReaders readers = new SAMReaders(readerIDs, validationStringency, removeProgramRecords); - allResources.add(readers); - availableResources.add(readers); - } - - } - - /** - * A collection of readers derived from a reads metadata structure. - */ - private class SAMReaders implements Iterable { - /** - * Cached representation of the merged header used to generate a merging iterator. - */ - private final SamFileHeaderMerger headerMerger; - - /** - * Internal storage for a map of id -> reader. - */ - private final Map readers = new LinkedHashMap(); - - /** - * The inptu streams backing - */ - private final Map inputStreams = new LinkedHashMap(); - - /** - * Derive a new set of readers from the Reads metadata. - * @param readerIDs reads to load. - * TODO: validationStringency is not used here - * @param validationStringency validation stringency. - * @param removeProgramRecords indicate whether to clear program records from the readers - */ - public SAMReaders(Collection readerIDs, ValidationStringency validationStringency, boolean removeProgramRecords) { - final int totalNumberOfFiles = readerIDs.size(); - int readerNumber = 1; - final SimpleTimer timer = new SimpleTimer().start(); - - if ( totalNumberOfFiles > 0 ) logger.info("Initializing SAMRecords in serial"); - final int tickSize = 50; - int nExecutedTotal = 0; - long lastTick = timer.currentTime(); - for(final SAMReaderID readerID: readerIDs) { - final ReaderInitializer init = new ReaderInitializer(readerID).call(); - - checkForUnsupportedBamFile(init.reader.getFileHeader()); - - if (removeProgramRecords) { - init.reader.getFileHeader().setProgramRecords(new ArrayList()); - } - - if (threadAllocation.getNumIOThreads() > 0) { - inputStreams.put(init.readerID, init.blockInputStream); // get from initializer - } - - logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, readerID.samFile)); - readers.put(init.readerID,init.reader); - if ( ++nExecutedTotal % tickSize == 0) { - double tickInSec = (timer.currentTime() - lastTick) / 1000.0; - printReaderPerformance(nExecutedTotal, tickSize, totalNumberOfFiles, timer, tickInSec); - lastTick = timer.currentTime(); - } - } - - if ( totalNumberOfFiles > 0 ) logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime())); - - Collection headers = new LinkedList(); - - // Examine the bam headers, perform any requested sample renaming on them, and add - // them to the list of headers to pass to the Picard SamFileHeaderMerger: - for ( final Map.Entry readerEntry : readers.entrySet() ) { - final SAMReaderID readerID = readerEntry.getKey(); - final SAMFileReader reader = readerEntry.getValue(); - final SAMFileHeader header = reader.getFileHeader(); - - // The remappedSampleName will be null if either no on-the-fly sample renaming was requested, - // or the user's sample rename map file didn't contain an entry for this bam file: - final String remappedSampleName = sampleRenameMap != null ? sampleRenameMap.get(readerID.getSamFilePath()) : null; - - // If we've been asked to rename the sample for this bam file, do so now. We'll check to - // make sure this bam only contains reads from one sample before proceeding. - // - // IMPORTANT: relies on the fact that the Picard SamFileHeaderMerger makes a copy of - // the existing read group attributes (including sample name) when merging - // headers, regardless of whether there are read group collisions or not. - if ( remappedSampleName != null ) { - remapSampleName(readerID, header, remappedSampleName); - } - - headers.add(header); - } - - headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,headers,true); - - // update all read groups to GATKSAMRecordReadGroups - final List gatkReadGroups = new LinkedList(); - for ( final SAMReadGroupRecord rg : headerMerger.getMergedHeader().getReadGroups() ) { - gatkReadGroups.add(new GATKSAMReadGroupRecord(rg)); - } - headerMerger.getMergedHeader().setReadGroups(gatkReadGroups); - } - - /** - * Changes the sample name in the read groups for the provided bam file header to match the - * remappedSampleName. Blows up with a UserException if the header contains more than one - * sample name. - * - * @param readerID ID for the bam file from which the provided header came from - * @param header The bam file header. Will be modified by this call. - * @param remappedSampleName New sample name to replace the existing sample attribute in the - * read groups for the header. - */ - private void remapSampleName( final SAMReaderID readerID, final SAMFileHeader header, final String remappedSampleName ) { - String firstEncounteredSample = null; - - for ( final SAMReadGroupRecord readGroup : header.getReadGroups() ) { - final String thisReadGroupSample = readGroup.getSample(); - - if ( thisReadGroupSample == null ) { - throw new UserException(String.format("On-the fly sample renaming was requested for bam file %s, however this " + - "bam file contains a read group (id: %s) with a null sample attribute", - readerID.getSamFilePath(), readGroup.getId())); - } - else if ( firstEncounteredSample == null ) { - firstEncounteredSample = thisReadGroupSample; - } - else if ( ! firstEncounteredSample.equals(thisReadGroupSample) ) { - throw new UserException(String.format("On-the-fly sample renaming was requested for bam file %s, " + - "however this bam file contains reads from more than one sample " + - "(encountered samples %s and %s in the bam header). The GATK requires that " + - "all bams for which on-the-fly sample renaming is requested " + - "contain reads from only a single sample per bam.", - readerID.getSamFilePath(), firstEncounteredSample, thisReadGroupSample)); - } - - readGroup.setSample(remappedSampleName); - } - } - - final private void printReaderPerformance(final int nExecutedTotal, - final int nExecutedInTick, - final int totalNumberOfFiles, - final SimpleTimer timer, - final double tickDurationInSec) { - final int pendingSize = totalNumberOfFiles - nExecutedTotal; - final double totalTimeInSeconds = timer.getElapsedTime(); - final double nTasksPerSecond = nExecutedTotal / (1.0*totalTimeInSeconds); - final int nRemaining = pendingSize; - final double estTimeToComplete = pendingSize / nTasksPerSecond; - logger.info(String.format("Init %d BAMs in last %.2f s, %d of %d in %.2f s / %.2f m (%.2f tasks/s). %d remaining with est. completion in %.2f s / %.2f m", - nExecutedInTick, tickDurationInSec, - nExecutedTotal, totalNumberOfFiles, totalTimeInSeconds, totalTimeInSeconds / 60, nTasksPerSecond, - nRemaining, estTimeToComplete, estTimeToComplete / 60)); - } - - /** - * Return the header derived from the merging of these BAM files. - * @return the merged header. - */ - public SAMFileHeader getMergedHeader() { - return headerMerger.getMergedHeader(); - } - - /** - * Do multiple read groups collide in this dataset? - * @return True if multiple read groups collide; false otherwis. - */ - public boolean hasReadGroupCollisions() { - return headerMerger.hasReadGroupCollisions(); - } - - /** - * Get the newly mapped read group ID for the given read group. - * @param readerID Reader for which to discern the transformed ID. - * @param originalReadGroupID Original read group. - * @return Remapped read group. - */ - public String getReadGroupId(final SAMReaderID readerID, final String originalReadGroupID) { - SAMFileHeader header = readers.get(readerID).getFileHeader(); - return headerMerger.getReadGroupId(header,originalReadGroupID); - } - - /** - * Creates a new merging iterator from the given map, with the given header. - * @param iteratorMap A map of readers to iterators. - * @return An iterator which will merge those individual iterators. - */ - public MergingSamRecordIterator createMergingIterator(final Map> iteratorMap) { - return new MergingSamRecordIterator(headerMerger,iteratorMap,true); - } - - /** - * Retrieve the reader from the data structure. - * @param id The ID of the reader to retrieve. - * @return the reader associated with the given id. - */ - public SAMFileReader getReader(SAMReaderID id) { - if(!readers.containsKey(id)) - throw new NoSuchElementException("No reader is associated with id " + id); - return readers.get(id); - } - - /** - * Retrieve the input stream backing a reader. - * @param id The ID of the reader to retrieve. - * @return the reader associated with the given id. - */ - public BlockInputStream getInputStream(final SAMReaderID id) { - return inputStreams.get(id); - } - - /** - * Searches for the reader id of this reader. - * @param reader Reader for which to search. - * @return The id associated the given reader, or null if the reader is not present in this collection. - */ - protected SAMReaderID getReaderID(SamReader reader) { - for(Map.Entry entry: readers.entrySet()) { - if(reader == entry.getValue()) - return entry.getKey(); - } - // Not found? return null. - return null; - } - - /** - * Returns an iterator over all readers in this structure. - * @return An iterator over readers. - */ - public Iterator iterator() { - return readers.values().iterator(); - } - - /** - * Returns whether any readers are present in this structure. - * @return - */ - public boolean isEmpty() { - return readers.isEmpty(); - } - } - - class ReaderInitializer implements Callable { - final SAMReaderID readerID; - BlockInputStream blockInputStream = null; - SAMFileReader reader; - - public ReaderInitializer(final SAMReaderID readerID) { - this.readerID = readerID; - } - - public ReaderInitializer call() { - final File indexFile = findIndexFile(readerID.samFile); - try { - if (threadAllocation.getNumIOThreads() > 0) - blockInputStream = new BlockInputStream(dispatcher,readerID,false); - reader = new SAMFileReader(readerID.samFile,indexFile,false); - } catch ( RuntimeIOException e ) { - throw new UserException.CouldNotReadInputFile(readerID.samFile, e); - } catch ( SAMFormatException e ) { - throw new UserException.MalformedBAM(readerID.samFile, e.getMessage()); - } - // Picard is throwing a RuntimeException here when BAMs are malformed with bad headers (and so look like SAM files). - // Let's keep this separate from the SAMFormatException (which ultimately derives from RuntimeException) case, - // just in case we want to change this behavior later. - catch ( RuntimeException e ) { - throw new UserException.MalformedBAM(readerID.samFile, e.getMessage()); - } - reader.setSAMRecordFactory(factory); - reader.enableFileSource(true); - reader.setValidationStringency(validationStringency); - return this; - } - } - - private class ReleasingIterator implements GATKSAMIterator { - /** - * The resource acting as the source of the data. - */ - private final SAMReaders resource; - - /** - * The iterator to wrap. - */ - private final GATKSAMIterator wrappedIterator; - - public ReleasingIterator(SAMReaders resource, GATKSAMIterator wrapped) { - this.resource = resource; - this.wrappedIterator = wrapped; - } - - public ReleasingIterator iterator() { - return this; - } - - public void remove() { - throw new UnsupportedOperationException("Can't remove from a GATKSAMIterator"); - } - - public void close() { - wrappedIterator.close(); - resourcePool.releaseReaders(resource); - } - - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - public SAMRecord next() { - return wrappedIterator.next(); - } - } - - /** - * Maps read groups in the original SAMFileReaders to read groups in - */ - private class ReadGroupMapping extends HashMap {} - - /** - * Locates the index file alongside the given BAM, if present. - * @param bamFile The data file to use. - * @return A File object if the index file is present; null otherwise. - */ - private File findIndexFile(File bamFile) { - return SamFiles.findIndex(bamFile); - } - - /** - * Creates a BAM schedule over all reads in the BAM file, both mapped and unmapped. The outgoing stream - * will be as granular as possible given our current knowledge of the best ways to split up BAM files. - * @return An iterator that spans all reads in all BAM files. - */ - public Iterable createShardIteratorOverAllReads(final ShardBalancer shardBalancer) { - shardBalancer.initialize(this,IntervalSharder.shardOverAllReads(this,genomeLocParser),genomeLocParser); - return shardBalancer; - } - - /** - * Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any - * read that has been assigned - * - * @param shardBalancer shard balancer object - * @return non-null initialized version of the shard balancer - */ - public Iterable createShardIteratorOverMappedReads(final ShardBalancer shardBalancer) { - shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,genomeLocParser),genomeLocParser); - return shardBalancer; - } - - /** - * Create a schedule for processing the initialized BAM file using the given interval list. - * The returned schedule should be as granular as possible. - * @param intervals The list of intervals for which to create the schedule. - * @return A granular iterator over file pointers. - */ - public Iterable createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) { - if(intervals == null) - throw new ReviewedGATKException("Unable to create schedule from intervals; no intervals were provided."); - shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals,intervalMergingRule),genomeLocParser); - return shardBalancer; - } -} - - - diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMReaderID.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMReaderID.java deleted file mode 100644 index ef5aaa040..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMReaderID.java +++ /dev/null @@ -1,125 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import org.broadinstitute.gatk.utils.commandline.Tags; - -import java.io.File; - -/** - * Uniquely identifies a SAM file reader. - * - * @author mhanna - * @version 0.1 - */ -public class SAMReaderID implements Comparable { - /** - * The SAM file at the heart of this reader. SAMReaderID - * currently supports only file-based readers. - */ - protected final File samFile; - - /** - * A list of tags associated with this BAM file. - */ - protected final Tags tags; - - /** - * Creates an identifier for a SAM file based on read. - * @param samFile The source file for SAM data. - * @param tags tags to use when creating a reader ID. - */ - public SAMReaderID(File samFile, Tags tags) { - this.samFile = samFile; - this.tags = tags; - } - - /** - * Creates an identifier for a SAM file based on read. - * @param samFileName The source filename for SAM data. - * @param tags tags to use when creating a reader ID. - */ - public SAMReaderID(String samFileName, Tags tags) { - this(new File(samFileName),tags); - } - - /** - * Gets the absolute pathname of this SAM file - * @return The absolute pathname of this reader's SAM file, - * or null if this reader has no associated SAM file - */ - public String getSamFilePath() { - if ( samFile == null ) { - return null; - } - - return samFile.getAbsolutePath(); - } - - /** - * Gets the tags associated with the given BAM file. - * @return A collection of the tags associated with this file. - */ - public Tags getTags() { - return tags; - } - - /** - * Compare two IDs to see whether they're equal. - * @param other The other identifier. - * @return True iff the two readers point to the same file. - */ - @Override - public boolean equals(Object other) { - if(other == null) return false; - if(!(other instanceof SAMReaderID)) return false; - - SAMReaderID otherID = (SAMReaderID)other; - return this.getSamFilePath().equals(otherID.getSamFilePath()); - } - - /** - * Generate a hash code for this object. - * @return A hash code, based solely on the file name at this point. - */ - @Override - public int hashCode() { - return samFile.getAbsolutePath().hashCode(); - } - - /** - * Best string representation for a SAM file reader is the path of the source file. - */ - @Override - public String toString() { - return getSamFilePath(); - } - - @Override - public int compareTo(Object other) { - return this.samFile.getAbsolutePath().compareTo(((SAMReaderID)other).samFile.getAbsolutePath()); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/Shard.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/Shard.java deleted file mode 100644 index cc8944ce3..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/Shard.java +++ /dev/null @@ -1,253 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.util.PeekableIterator; -import htsjdk.samtools.SAMFileSpan; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.ReadMetrics; -import org.broadinstitute.gatk.engine.ReadProperties; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.HasGenomeLocation; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -/** - * - * User: aaron - * Date: Apr 10, 2009 - * Time: 5:00:27 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - -/** - * @author aaron - * @version 1.0 - * @date Apr 10, 2009 - *

    - * Interface Shard - *

    - * The base abstract class for shards. - */ -public abstract class Shard implements HasGenomeLocation { - public enum ShardType { - READ, LOCUS - } - - protected final GenomeLocParser parser; // incredibly annoying! - - /** - * What type of shard is this? Read or locus? - */ - protected final ShardType shardType; - - /** - * Locations. - */ - protected final List locs; - - /** - * Whether the current location is unmapped. - */ - private final boolean isUnmapped; - - /** - * Reads data, if applicable. - */ - private final SAMDataSource readsDataSource; - - /** - * The data backing the next chunks to deliver to the traversal engine. - */ - private final Map fileSpans; - - /** - * Lazy-calculated span of all of the genome locs in this shard - */ - private GenomeLoc spanningLocation = null; - - /** - * Statistics about which reads in this shards were used and which were filtered away. - */ - protected final ReadMetrics readMetrics = new ReadMetrics(); - - /** - * Whether this shard points to an unmapped region. - * Some shard types conceptually be unmapped (e.g. LocusShards). In - * this case, isUnmapped should always return false. - * @return True if this shard is unmapped. False otherwise. - */ - public boolean isUnmapped() { - return isUnmapped; - } - - public Shard(GenomeLocParser parser, - ShardType shardType, - List locs, - SAMDataSource readsDataSource, - Map fileSpans, - boolean isUnmapped) { - this.locs = locs; - this.parser = parser; - this.shardType = shardType; - this.readsDataSource = readsDataSource; - this.fileSpans = fileSpans; - this.isUnmapped = isUnmapped; - } - - /** - * If isUnmapped is true, than getGenomeLocs by - * definition will return a singleton list with a GenomeLoc.UNMAPPED - * - * Can return null, indicating that the entire genome is covered. - * - * @return the genome location represented by this shard - */ - public List getGenomeLocs() { - return locs; - } - - /** - * Get the list of chunks delimiting this shard. - * @return a list of chunks that contain data for this shard. - */ - public Map getFileSpans() { - return Collections.unmodifiableMap(fileSpans); - } - - /** - * Returns the span of the genomeLocs comprising this shard - * @return a GenomeLoc that starts as the first position in getGenomeLocs() and stops at the stop of the last - * position in getGenomeLocs() - */ - public GenomeLoc getLocation() { - if ( spanningLocation == null ) { - if ( getGenomeLocs() == null ) - spanningLocation = GenomeLoc.WHOLE_GENOME; - else if ( getGenomeLocs().size() == 0 ) { - spanningLocation = getGenomeLocs().get(0); - } else { - int start = Integer.MAX_VALUE; - int stop = Integer.MIN_VALUE; - String contig = null; - - for ( GenomeLoc loc : getGenomeLocs() ) { - if ( GenomeLoc.isUnmapped(loc) ) - // special case the unmapped region marker, just abort out - return loc; - contig = loc.getContig(); - if ( loc.getStart() < start ) start = loc.getStart(); - if ( loc.getStop() > stop ) stop = loc.getStop(); - } - - spanningLocation = parser.createGenomeLoc(contig, start, stop); - } - } - - return spanningLocation; - } - - - /** - * what kind of shard do we return - * @return ShardType, indicating the type - */ - public ShardType getShardType() { - return shardType; - } - - /** - * Does any releasing / aggregation required when the shard is through being processed. - */ - public void close() { - readsDataSource.incorporateReadMetrics(readMetrics); - } - - /** - * Gets key read validation and filtering properties. - * @return set of read properties associated with this shard. - */ - public ReadProperties getReadProperties() { - return readsDataSource.getReadsInfo(); - } - - /** - * Gets the runtime metrics associated with this shard. - * Retrieves a storage space of metrics about number of reads included, filtered, etc. - * @return Storage space for metrics. - */ - public ReadMetrics getReadMetrics() { - return readMetrics; - } - - /** - * Returns true if this shard is meant to buffer reads, rather - * than just holding pointers to their locations. - * @return True if this shard can buffer reads. False otherwise. - */ - public boolean buffersReads() { return false; } - - /** - * Returns true if the read buffer is currently full. - * @return True if this shard's buffer is full (and the shard can buffer reads). - */ - public boolean isBufferEmpty() { throw new UnsupportedOperationException("This shard does not buffer reads."); } - - /** - * Returns true if the read buffer is currently full. - * @return True if this shard's buffer is full (and the shard can buffer reads). - */ - public boolean isBufferFull() { throw new UnsupportedOperationException("This shard does not buffer reads."); } - - /** - * Adds a read to the read buffer. - * @param read Add a read to the internal shard buffer. - */ - public void addRead(SAMRecord read) { throw new UnsupportedOperationException("This shard does not buffer reads."); } - - /** - * Fills the shard with reads. Can only do this with shards that buffer reads - * @param readIter Iterator from which to draw the reads to fill the shard - */ - public void fill( PeekableIterator readIter ) { throw new UnsupportedOperationException("This shard does not buffer reads."); } - - /** - * Gets the iterator over the elements cached in the shard. - * @return - */ - public GATKSAMIterator iterator() { throw new UnsupportedOperationException("This shard does not buffer reads."); } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java deleted file mode 100644 index 9105b4cf8..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java +++ /dev/null @@ -1,192 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads.utilities; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; -import org.broadinstitute.gatk.utils.commandline.Input; -import org.broadinstitute.gatk.utils.commandline.Output; -import org.broadinstitute.gatk.engine.datasources.reads.FilePointer; -import org.broadinstitute.gatk.engine.datasources.reads.IntervalSharder; -import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; -import org.broadinstitute.gatk.engine.datasources.reads.SAMReaderID; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.GenomeLocSortedSet; -import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; -import org.broadinstitute.gatk.utils.interval.IntervalUtils; -import org.broadinstitute.gatk.utils.text.ListFileUtils; - -import java.io.File; -import java.io.IOException; -import java.io.PrintStream; -import java.math.BigInteger; -import java.util.ArrayList; -import java.util.List; - -/** - * Traverses a region in a dataset looking for outliers. - */ -public class FindLargeShards extends CommandLineProgram { - private static Logger logger = Logger.getLogger(FindLargeShards.class); - - @Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false) - public List samFiles = new ArrayList(); - - @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) - public File referenceFile = null; - - @Input(fullName = "intervals", shortName = "L", doc = "A list of genomic intervals over which to operate. Can be explicitly specified on the command line or in a file.",required=false) - public List intervals = null; - - @Output(required=false) - public PrintStream out = System.out; - - /** - * The square of the sum of all uncompressed data. Based on the BAM spec, the size of this could be - * up to (2^64)^2. - */ - private BigInteger sumOfSquares = BigInteger.valueOf(0); - - /** - * The running sum of all uncompressed data. Based on the BAM spec, the BAM must be less than Long.MAX_LONG - * when compressed -- in other words, the sum of the sizes of all BGZF blocks must be < 2^64. - */ - private BigInteger sum = BigInteger.valueOf(0); - - /** - * The number of shards viewed. - */ - private long numberOfShards; - - - @Override - public int execute() throws IOException { - // initialize reference - IndexedFastaSequenceFile refReader = new IndexedFastaSequenceFile(referenceFile); - GenomeLocParser genomeLocParser = new GenomeLocParser(refReader); - - // initialize reads - List bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser); - SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser); - - // intervals - final GenomeLocSortedSet intervalSortedSet; - if ( intervals != null ) - intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals), IntervalMergingRule.ALL); - else - intervalSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(refReader.getSequenceDictionary()); - - logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize")); - - IntervalSharder sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet,IntervalMergingRule.ALL); - while(sharder.hasNext()) { - FilePointer filePointer = sharder.next(); - - // Size of the file pointer. - final long size = filePointer.size(); - - BigInteger bigSize = BigInteger.valueOf(size); - sumOfSquares = sumOfSquares.add(bigSize.pow(2)); - sum = sum.add(bigSize); - numberOfShards++; - - if(numberOfShards % 1000 == 0) { - GenomeLoc boundingRegion = getBoundingRegion(filePointer,genomeLocParser); - logger.info(String.format("PROGRESS: Calculating mean and variance: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size)); - } - - } - - // Print out the stddev: (sum(x^2) - (1/N)*sum(x)^2)/N - long mean = sum.divide(BigInteger.valueOf(numberOfShards)).longValue(); - long stddev = (long)(Math.sqrt(sumOfSquares.subtract(sum.pow(2).divide(BigInteger.valueOf(numberOfShards))).divide(BigInteger.valueOf(numberOfShards)).doubleValue())); - logger.info(String.format("Number of shards: %d; mean uncompressed size = %d; stddev uncompressed size = %d%n",numberOfShards,mean,stddev)); - - // Crank through the shards again, this time reporting on the shards significantly larger than the mean. - long threshold = mean + stddev*5; - logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize")); - out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n"); - - sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet,IntervalMergingRule.ALL); - while(sharder.hasNext()) { - FilePointer filePointer = sharder.next(); - - // Bounding region. - GenomeLoc boundingRegion = getBoundingRegion(filePointer,genomeLocParser); - - // Size of the file pointer. - final long size = filePointer.size(); - - numberOfShards++; - - if(filePointer.size() <= threshold) { - if(numberOfShards % 1000 == 0) - logger.info(String.format("PROGRESS: Searching for large shards: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size)); - continue; - } - - out.printf("%s\t%d\t%d\t%d%n",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size); - } - - return 0; - } - - private GenomeLoc getBoundingRegion(final FilePointer filePointer, final GenomeLocParser genomeLocParser) { - List regions = filePointer.getLocations(); - - // The region contained by this FilePointer. - final String contig = regions.get(0).getContig(); - final int start = regions.get(0).getStart(); - final int stop = regions.get(regions.size()-1).getStop(); - - return genomeLocParser.createGenomeLoc(contig,start,stop); - } - - /** - * Required main method implementation. - * @param argv Command-line argument text. - * @throws Exception on error. - */ - public static void main(String[] argv) throws Exception { - int returnCode = 0; - try { - FindLargeShards instance = new FindLargeShards(); - start(instance, argv); - returnCode = 0; - } - catch(Exception ex) { - returnCode = 1; - ex.printStackTrace(); - throw ex; - } - finally { - System.exit(returnCode); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java deleted file mode 100644 index 6fdbea3a0..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java +++ /dev/null @@ -1,199 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reference; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.samtools.SAMSequenceRecord; -import org.broadinstitute.gatk.engine.datasources.reads.LocusShard; -import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; -import org.broadinstitute.gatk.engine.datasources.reads.Shard; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.GenomeLocSortedSet; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; - -import java.io.File; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -/** - * Loads reference data from fasta file - * Looks for fai and dict files, and tries to create them if they don't exist - */ -public class ReferenceDataSource { - private IndexedFastaSequenceFile reference; - - /** our log, which we want to capture anything from this class */ - protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class); - - /** - * Create reference data source from fasta file - * @param fastaFile Fasta file to be used as reference - */ - public ReferenceDataSource(File fastaFile) { - // does the fasta file exist? check that first... - if (!fastaFile.exists()) - throw new UserException("The fasta file you specified (" + fastaFile.getAbsolutePath() + ") does not exist."); - - final boolean isGzipped = fastaFile.getAbsolutePath().endsWith(".gz"); - if ( isGzipped ) { - throw new UserException.CannotHandleGzippedRef(); - } - - final File indexFile = new File(fastaFile.getAbsolutePath() + ".fai"); - - // determine the name for the dict file - final String fastaExt = fastaFile.getAbsolutePath().endsWith("fa") ? "\\.fa$" : "\\.fasta$"; - final File dictFile = new File(fastaFile.getAbsolutePath().replaceAll(fastaExt, ".dict")); - - // It's an error if either the fai or dict file does not exist. The user is now responsible - // for creating these files. - if (!indexFile.exists()) { - throw new UserException.MissingReferenceFaiFile(indexFile, fastaFile); - } - if (!dictFile.exists()) { - throw new UserException.MissingReferenceDictFile(dictFile, fastaFile); - } - - // Read reference data by creating an IndexedFastaSequenceFile. - try { - reference = new CachingIndexedFastaSequenceFile(fastaFile); - } - catch (IllegalArgumentException e) { - throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e); - } - catch (Exception e) { - throw new UserException.CouldNotReadInputFile(fastaFile, e); - } - } - - /** - * Get indexed fasta file - * @return IndexedFastaSequenceFile that was created from file - */ - public IndexedFastaSequenceFile getReference() { - return this.reference; - } - - /** - * Creates an iterator for processing the entire reference. - * @param readsDataSource the reads datasource to embed in the locus shard. - * @param parser used to generate/regenerate intervals. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. - * @param maxShardSize The maximum shard size which can be used to create this list. - * @return Creates a schedule for performing a traversal over the entire reference. - */ - public Iterable createShardsOverEntireReference(final SAMDataSource readsDataSource, final GenomeLocParser parser, final int maxShardSize) { - List shards = new ArrayList(); - for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) { - for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) { - final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength()); - shards.add(new LocusShard(parser, - readsDataSource, - Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)), - null)); - } - } - return shards; - } - - - public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) { - List shards = new ArrayList(); - - for(GenomeLoc interval: intervals) { - while(interval.size() > maxShardSize) { - shards.add(new LocusShard(intervals.getGenomeLocParser(), - readsDataSource, - Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)), - null)); - interval = intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop()); - } - shards.add(new LocusShard(intervals.getGenomeLocParser(), - readsDataSource, - Collections.singletonList(interval), - null)); - } - - return shards; - } - - - /** - * Creates an iterator for processing the entire reference. - * @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. - * @param intervals the list of intervals to use when processing the reference. - * @param targetShardSize the suggested - and maximum - shard size which can be used to create this list; we will merge intervals greedily so that we generate shards up to but not greater than the target size. - * @return Creates a schedule for performing a traversal over the entire reference. - */ -/* - public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int targetShardSize) { - final List shards = new ArrayList(); - final GenomeLocParser parser = intervals.getGenomeLocParser(); - LinkedList currentIntervals = new LinkedList(); - - for(GenomeLoc interval: intervals) { - // if the next interval is too big, we can safely shard currentInterval and then break down this one - if (interval.size() > targetShardSize) { - if (!currentIntervals.isEmpty()) - shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); - while(interval.size() > targetShardSize) { - final GenomeLoc partialInterval = parser.createGenomeLoc(interval.getContig(), interval.getStart(), interval.getStart()+targetShardSize-1); - shards.add(createShardFromInterval(Collections.singletonList(partialInterval), readsDataSource, parser)); - interval = parser.createGenomeLoc(interval.getContig(), interval.getStart() + targetShardSize, interval.getStop()); - } - currentIntervals = new LinkedList(); - currentIntervals.add(interval); - } - // otherwise, we need to check whether we can merge this interval with currentInterval (and either shard currentInterval or merge accordingly) - else { - if (currentIntervals.isEmpty()) { - currentIntervals.add(interval); - } - else { - if (currentIntervals.getLast().compareContigs(interval) != 0 || interval.getStop() - currentIntervals.getLast().getStart() + 1 > targetShardSize) { - shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); - currentIntervals = new LinkedList(); - } - currentIntervals.add(interval); - } - } - } - if (!currentIntervals.isEmpty()) - shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); - return shards; - } - - private static Shard createShardFromInterval(final List intervals, final SAMDataSource readsDataSource, final GenomeLocParser parser) { - //logger.debug("Adding shard " + interval); - return new LocusShard(parser, - readsDataSource, - intervals, - null); - } -*/ -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPool.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPool.java deleted file mode 100644 index 762eb0b44..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPool.java +++ /dev/null @@ -1,153 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.rmd; - -import htsjdk.samtools.SAMSequenceDictionary; -import org.broadinstitute.gatk.engine.refdata.SeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrack; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.gatk.engine.refdata.utils.FlashBackIterator; -import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.List; - -/** - * A pool of reference-ordered data iterators. - */ -class ReferenceOrderedDataPool extends ResourcePool { - // the reference-ordered data itself. - private final RMDTriplet fileDescriptor; - - // our tribble track builder - private final RMDTrackBuilder builder; - - /** - * The header from this RMD, if present. - */ - private final Object header; - - /** - * The sequence dictionary from this ROD. If no sequence dictionary is present, this dictionary will be the same as the reference's. - */ - private final SAMSequenceDictionary sequenceDictionary; - - boolean flashbackData = false; - public ReferenceOrderedDataPool(RMDTriplet fileDescriptor,RMDTrackBuilder builder,SAMSequenceDictionary sequenceDictionary, GenomeLocParser genomeLocParser,boolean flashbackData) { - super(sequenceDictionary,genomeLocParser); - this.fileDescriptor = fileDescriptor; - this.builder = builder; - this.flashbackData = flashbackData; - - // prepopulate one RMDTrack - LocationAwareSeekableRODIterator iterator = createNewResource(); - this.addNewResource(iterator); - - // Pull the proper header and sequence dictionary from the prepopulated track. - this.header = iterator.getHeader(); - this.sequenceDictionary = iterator.getSequenceDictionary(); - } - - /** - * Gets the header used by this resource pool. - * @return Header used by this resource pool. - */ - public Object getHeader() { - return header; - } - - /** - * Gets the sequence dictionary built into the ROD index file. - * @return Sequence dictionary from the index file. - */ - public SAMSequenceDictionary getSequenceDictionary() { - return sequenceDictionary; - } - - /** - * Create a new iterator from the existing reference-ordered data. This new iterator is expected - * to be completely independent of any other iterator. - * @return The newly created resource. - */ - public LocationAwareSeekableRODIterator createNewResource() { - if(numIterators() > 0) - throw new ReviewedGATKException("BUG: Tried to create multiple iterators over streaming ROD interface"); - RMDTrack track = builder.createInstanceOfTrack(fileDescriptor); - LocationAwareSeekableRODIterator iter = new SeekableRODIterator(track.getHeader(),track.getSequenceDictionary(),referenceSequenceDictionary,genomeLocParser,track.getIterator()); - return (flashbackData) ? new FlashBackIterator(iter) : iter; - } - - /** - * Finds the best existing ROD iterator from the pool. In this case, the best existing ROD is defined as - * the first one encountered that is at or before the given position. - * @param segment @{inheritedDoc} - * @param resources @{inheritedDoc} - * @return @{inheritedDoc} - */ - public LocationAwareSeekableRODIterator selectBestExistingResource( DataStreamSegment segment, List resources ) { - if(segment instanceof MappedStreamSegment) { - GenomeLoc position = ((MappedStreamSegment)segment).getLocation(); - - for( LocationAwareSeekableRODIterator RODIterator : resources ) { - - if( (RODIterator.position() == null && RODIterator.hasNext()) || - (RODIterator.position() != null && RODIterator.position().isBefore(position)) ) - return RODIterator; - if (RODIterator.position() != null && RODIterator instanceof FlashBackIterator && ((FlashBackIterator)RODIterator).canFlashBackTo(position)) { - ((FlashBackIterator)RODIterator).flashBackTo(position); - return RODIterator; - } - - } - return null; - } - else if(segment instanceof EntireStream) { - // Asking for a segment over the entire stream, so by definition, there is no best existing resource. - // Force the system to create a new one. - return null; - } - else { - throw new ReviewedGATKException("Unable to find a ROD iterator for segments of type " + segment.getClass()); - } - } - - /** - * In this case, the iterator is the resource. Pass it through. - */ - public LocationAwareSeekableRODIterator createIteratorFromResource( DataStreamSegment segment, LocationAwareSeekableRODIterator resource ) { - return resource; - } - - /** - * kill the buffers in the iterator - */ - public void closeResource( LocationAwareSeekableRODIterator resource ) { - if (resource instanceof FlashBackIterator) ((FlashBackIterator)resource).close(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataSource.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataSource.java deleted file mode 100644 index 9d9e7c87f..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataSource.java +++ /dev/null @@ -1,256 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.rmd; - -import htsjdk.samtools.SAMSequenceDictionary; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.engine.refdata.SeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrack; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.lang.reflect.Type; -import java.util.List; - -/** - * A data source which provides a single type of reference-ordered data. - */ -public class ReferenceOrderedDataSource { - /** - * The reference-ordered data itself. - */ - private final RMDTriplet fileDescriptor; - - /** - * The header associated with this VCF, if any. - */ - private final Object header; - - /** - * The private sequence dictionary associated with this RMD. - */ - private final SAMSequenceDictionary sequenceDictionary; - - /** - * The builder to use when constructing new reference-ordered data readers. - */ - private final RMDTrackBuilder builder; - - /** - * A pool of iterators for navigating through the genome. - */ - private final ResourcePool iteratorPool; - - /** - * Create a new reference-ordered data source. - */ - public ReferenceOrderedDataSource(RMDTriplet fileDescriptor, - RMDTrackBuilder builder, - SAMSequenceDictionary referenceSequenceDictionary, - GenomeLocParser genomeLocParser, - boolean flashbackData ) { - this.fileDescriptor = fileDescriptor; - this.builder = builder; - - // TODO: Unify the two blocks of code below by creating a ReferenceOrderedDataPool base class of a coherent type (not RMDTrack for one and SeekableIterator for the other). - if (fileDescriptor.getStorageType() != RMDTriplet.RMDStorageType.STREAM) { - iteratorPool = new ReferenceOrderedQueryDataPool(fileDescriptor, - builder, - referenceSequenceDictionary, - genomeLocParser); - this.header = ((ReferenceOrderedQueryDataPool)iteratorPool).getHeader(); - this.sequenceDictionary = ((ReferenceOrderedQueryDataPool)iteratorPool).getSequenceDictionary(); - } - else { - iteratorPool = new ReferenceOrderedDataPool(fileDescriptor, - builder, - referenceSequenceDictionary, - genomeLocParser, - flashbackData); - this.header = ((ReferenceOrderedDataPool)iteratorPool).getHeader(); - this.sequenceDictionary = ((ReferenceOrderedDataPool)iteratorPool).getSequenceDictionary(); - } - } - - /** - * Return the name of the underlying reference-ordered data. - * @return Name of the underlying rod. - */ - public String getName() { - return fileDescriptor.getName(); - } - - public Class getType() { - return builder.getFeatureManager().getByTriplet(fileDescriptor).getCodecClass(); - } - - public Class getRecordType() { - return builder.getFeatureManager().getByTriplet(fileDescriptor).getFeatureClass(); - } - - public File getFile() { - return new File(fileDescriptor.getFile()); - } - - public Object getHeader() { - return header; - } - - public Tags getTags() { - return fileDescriptor.getTags(); - } - - public String getTagValue( final String key ) { - return fileDescriptor.getTags().getValue( key ); - } - - - /** - * Retrieves the sequence dictionary created by this ROD. - * @return - */ - public SAMSequenceDictionary getSequenceDictionary() { - return sequenceDictionary; - } - - /** - * helper function for determining if we are the same track based on name and record type - * - * @param name the name to match - * @param type the type to match - * - * @return true on a match, false if the name or type is different - */ - public boolean matchesNameAndRecordType(String name, Type type) { - return (name.equals(fileDescriptor.getName()) && (type.getClass().isAssignableFrom(getType().getClass()))); - } - - /** - * Seek to the specified position and return an iterator through the data. - * - * @param loc GenomeLoc that points to the selected position. - * - * @return Iterator through the data. - */ - public LocationAwareSeekableRODIterator seek(GenomeLoc loc) { - DataStreamSegment dataStreamSegment = loc != null ? new MappedStreamSegment(loc) : new EntireStream(); - return iteratorPool.iterator(dataStreamSegment); - } - - - /** - * Close the specified iterator, returning it to the pool. - * @param iterator Iterator to close. - */ - public void close( LocationAwareSeekableRODIterator iterator ) { - iteratorPool.release(iterator); - } - -} - -/** - * a data pool for the new query based RODs - */ -class ReferenceOrderedQueryDataPool extends ResourcePool { - // the reference-ordered data itself. - private final RMDTriplet fileDescriptor; - - // our tribble track builder - private final RMDTrackBuilder builder; - - /** - * The header from this RMD, if present. - */ - private final Object header; - - /** - * The sequence dictionary from this ROD. If no sequence dictionary is present, this dictionary will be the same as the reference's. - */ - private final SAMSequenceDictionary sequenceDictionary; - - public ReferenceOrderedQueryDataPool(RMDTriplet fileDescriptor, RMDTrackBuilder builder, SAMSequenceDictionary referenceSequenceDictionary, GenomeLocParser genomeLocParser) { - super(referenceSequenceDictionary,genomeLocParser); - this.fileDescriptor = fileDescriptor; - this.builder = builder; - - // prepopulate one RMDTrack - RMDTrack track = builder.createInstanceOfTrack(fileDescriptor); - this.addNewResource(track); - - // Pull the proper header and sequence dictionary from the prepopulated track. - this.header = track.getHeader(); - this.sequenceDictionary = track.getSequenceDictionary(); - } - - public Object getHeader() { - return header; - } - - public SAMSequenceDictionary getSequenceDictionary() { - return sequenceDictionary; - } - - @Override - protected RMDTrack createNewResource() { - return builder.createInstanceOfTrack(fileDescriptor); - } - - @Override - protected RMDTrack selectBestExistingResource(DataStreamSegment segment, List availableResources) { - for (RMDTrack reader : availableResources) - if (reader != null) return reader; - return null; - } - - @Override - protected LocationAwareSeekableRODIterator createIteratorFromResource(DataStreamSegment position, RMDTrack track) { - try { - if (position instanceof MappedStreamSegment) { - GenomeLoc pos = ((MappedStreamSegment) position).locus; - return new SeekableRODIterator(header,sequenceDictionary,referenceSequenceDictionary,genomeLocParser,track.query(pos)); - } else { - return new SeekableRODIterator(header,sequenceDictionary,referenceSequenceDictionary,genomeLocParser,track.getIterator()); - } - } catch (FileNotFoundException e) { - throw new UserException.CouldNotReadInputFile(fileDescriptor.getName(), "it could not be found"); - } catch (IOException e) { - throw new ReviewedGATKException("Unable to create iterator for rod named " + fileDescriptor.getName(),e); - } - } - - @Override - protected void closeResource(RMDTrack track) { - track.close(); - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/AlleleBiasedDownsamplingUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/AlleleBiasedDownsamplingUtils.java deleted file mode 100644 index 0bcf4ee62..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/AlleleBiasedDownsamplingUtils.java +++ /dev/null @@ -1,369 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.collections.DefaultHashMap; -import org.broadinstitute.gatk.utils.exceptions.GATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.pileup.PileupElement; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.text.XReadLines; -import htsjdk.variant.variantcontext.Allele; - -import java.io.File; -import java.io.IOException; -import java.util.*; - -public class AlleleBiasedDownsamplingUtils { - - // define this class so that we can use Java generics below - private final static class PileupElementList extends ArrayList {} - - /** - * Computes an allele biased version of the given pileup - * - * @param pileup the original pileup - * @param downsamplingFraction the fraction of total reads to remove per allele - * @return allele biased pileup - */ - public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) { - // special case removal of all or no reads - if ( downsamplingFraction <= 0.0 ) - return pileup; - if ( downsamplingFraction >= 1.0 ) - return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList()); - - final PileupElementList[] alleleStratifiedElements = new PileupElementList[4]; - for ( int i = 0; i < 4; i++ ) - alleleStratifiedElements[i] = new PileupElementList(); - - // start by stratifying the reads by the alleles they represent at this position - for ( final PileupElement pe : pileup ) { - final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); - if ( baseIndex != -1 ) - alleleStratifiedElements[baseIndex].add(pe); - } - - // make a listing of allele counts and calculate the total count - final int[] alleleCounts = calculateAlleleCounts(alleleStratifiedElements); - final int totalAlleleCount = (int)MathUtils.sum(alleleCounts); - - // do smart down-sampling - final int numReadsToRemove = (int)(totalAlleleCount * downsamplingFraction); // floor - final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); - - final HashSet readsToRemove = new HashSet(numReadsToRemove); - for ( int i = 0; i < 4; i++ ) { - final PileupElementList alleleList = alleleStratifiedElements[i]; - // if we don't need to remove any reads, then don't - if ( alleleCounts[i] > targetAlleleCounts[i] ) - readsToRemove.addAll(downsampleElements(alleleList, alleleCounts[i], alleleCounts[i] - targetAlleleCounts[i])); - } - - // we need to keep the reads sorted because the FragmentUtils code will expect them in coordinate order and will fail otherwise - final List readsToKeep = new ArrayList(totalAlleleCount - numReadsToRemove); - for ( final PileupElement pe : pileup ) { - if ( !readsToRemove.contains(pe) ) { - readsToKeep.add(pe); - } - } - - return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(readsToKeep)); - } - - /** - * Calculates actual allele counts for each allele (which can be different than the list size when reduced reads are present) - * - * @param alleleStratifiedElements pileup elements stratified by allele - * @return non-null int array representing allele counts - */ - private static int[] calculateAlleleCounts(final PileupElementList[] alleleStratifiedElements) { - final int[] alleleCounts = new int[alleleStratifiedElements.length]; - for ( int i = 0; i < alleleStratifiedElements.length; i++ ) { - alleleCounts[i] = alleleStratifiedElements[i].size(); - } - return alleleCounts; - } - - private static int scoreAlleleCounts(final int[] alleleCounts) { - if ( alleleCounts.length < 2 ) - return 0; - - // sort the counts (in ascending order) - final int[] alleleCountsCopy = alleleCounts.clone(); - Arrays.sort(alleleCountsCopy); - - final int maxCount = alleleCountsCopy[alleleCounts.length - 1]; - final int nextBestCount = alleleCountsCopy[alleleCounts.length - 2]; - - int remainderCount = 0; - for ( int i = 0; i < alleleCounts.length - 2; i++ ) - remainderCount += alleleCountsCopy[i]; - - // try to get the best score: - // - in the het case the counts should be equal with nothing else - // - in the hom case the non-max should be zero - return Math.min(maxCount - nextBestCount + remainderCount, Math.abs(nextBestCount + remainderCount)); - } - - /** - * Computes an allele biased version of the allele counts for a given pileup - * - * @param alleleCounts the allele counts for the original pileup - * @param numReadsToRemove number of total reads to remove per allele - * @return non-null array of new counts needed per allele - */ - protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) { - final int numAlleles = alleleCounts.length; - - int maxScore = scoreAlleleCounts(alleleCounts); - int[] alleleCountsOfMax = alleleCounts; - - final int numReadsToRemovePerAllele = numReadsToRemove / 2; - - for ( int i = 0; i < numAlleles; i++ ) { - for ( int j = i; j < numAlleles; j++ ) { - final int[] newCounts = alleleCounts.clone(); - - // split these cases so we don't lose on the floor (since we divided by 2) - if ( i == j ) { - newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemove); - } else { - newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemovePerAllele); - newCounts[j] = Math.max(0, newCounts[j] - numReadsToRemovePerAllele); - } - - final int score = scoreAlleleCounts(newCounts); - - if ( score < maxScore ) { - maxScore = score; - alleleCountsOfMax = newCounts; - } - } - } - - return alleleCountsOfMax; - } - - /** - * Performs allele biased down-sampling on a pileup and computes the list of elements to remove - * - * @param elements original list of pileup elements - * @param originalElementCount original count of elements (taking reduced reads into account) - * @param numElementsToRemove the number of records to remove - * @return the list of pileup elements TO REMOVE - */ - protected static List downsampleElements(final List elements, final int originalElementCount, final int numElementsToRemove) { - // are there no elements to remove? - if ( numElementsToRemove == 0 ) - return Collections.emptyList(); - - final ArrayList elementsToRemove = new ArrayList(numElementsToRemove); - - // should we remove all of the elements? - if ( numElementsToRemove >= originalElementCount ) { - elementsToRemove.addAll(elements); - return elementsToRemove; - } - - // create a bitset describing which elements to remove - final BitSet itemsToRemove = new BitSet(originalElementCount); - for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) { - itemsToRemove.set(selectedIndex); - } - - int currentBitSetIndex = 0; - for ( final PileupElement element : elements ) { - if ( itemsToRemove.get(currentBitSetIndex++) ) { - elementsToRemove.add(element); - } - } - - return elementsToRemove; - } - - /** - * Computes reads to remove based on an allele biased down-sampling - * - * @param alleleReadMap original list of records per allele - * @param downsamplingFraction the fraction of total reads to remove per allele - * @return list of reads TO REMOVE from allele biased down-sampling - */ - public static List selectAlleleBiasedReads(final Map> alleleReadMap, final double downsamplingFraction) { - int totalReads = 0; - for ( final List reads : alleleReadMap.values() ) - totalReads += reads.size(); - - int numReadsToRemove = (int)(totalReads * downsamplingFraction); - - // make a listing of allele counts - final List alleles = new ArrayList(alleleReadMap.keySet()); - alleles.remove(Allele.NO_CALL); // ignore the no-call bin - final int numAlleles = alleles.size(); - - final int[] alleleCounts = new int[numAlleles]; - for ( int i = 0; i < numAlleles; i++ ) - alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size(); - - // do smart down-sampling - final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); - - final List readsToRemove = new ArrayList(numReadsToRemove); - for ( int i = 0; i < numAlleles; i++ ) { - if ( alleleCounts[i] > targetAlleleCounts[i] ) { - readsToRemove.addAll(downsampleElements(alleleReadMap.get(alleles.get(i)), alleleCounts[i] - targetAlleleCounts[i])); - } - } - - return readsToRemove; - } - - /** - * Performs allele biased down-sampling on a pileup and computes the list of elements to remove - * - * @param reads original list of records - * @param numElementsToRemove the number of records to remove - * @return the list of pileup elements TO REMOVE - */ - protected static List downsampleElements(final List reads, final int numElementsToRemove) { - // are there no elements to remove? - if ( numElementsToRemove == 0 ) - return Collections.emptyList(); - - final ArrayList elementsToRemove = new ArrayList(numElementsToRemove); - final int originalElementCount = reads.size(); - - // should we remove all of the elements? - if ( numElementsToRemove >= originalElementCount ) { - elementsToRemove.addAll(reads); - return elementsToRemove; - } - - // create a bitset describing which elements to remove - final BitSet itemsToRemove = new BitSet(originalElementCount); - for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) { - itemsToRemove.set(selectedIndex); - } - - int currentBitSetIndex = 0; - for ( final GATKSAMRecord read : reads ) { - if ( itemsToRemove.get(currentBitSetIndex++) ) - elementsToRemove.add(read); - } - - return elementsToRemove; - } - - /** - * Create sample-contamination maps from file - * - * @param ContaminationFractionFile Filename containing two columns: SampleID and Contamination - * @param AvailableSampleIDs Set of Samples of interest (no reason to include every sample in file) or null to turn off checking - * @param logger for logging output - * @return sample-contamination Map - */ - - public static DefaultHashMap loadContaminationFile(File ContaminationFractionFile, final Double defaultContaminationFraction, final Set AvailableSampleIDs, Logger logger) throws GATKException { - DefaultHashMap sampleContamination = new DefaultHashMap(defaultContaminationFraction); - Set nonSamplesInContaminationFile = new HashSet(sampleContamination.keySet()); - try { - - XReadLines reader = new XReadLines(ContaminationFractionFile, true); - for (String line : reader) { - - if (line.length() == 0) { - continue; - } - - StringTokenizer st = new StringTokenizer(line,"\t"); - - String fields[] = new String[2]; - try { - fields[0] = st.nextToken(); - fields[1] = st.nextToken(); - } catch(NoSuchElementException e){ - throw new UserException.MalformedFile("Contamination file must have exactly two, tab-delimited columns. Offending line:\n" + line); - } - if(st.hasMoreTokens()) { - throw new UserException.MalformedFile("Contamination file must have exactly two, tab-delimited columns. Offending line:\n" + line); - } - - if (fields[0].length() == 0 || fields[1].length() == 0) { - throw new UserException.MalformedFile("Contamination file can not have empty strings in either column. Offending line:\n" + line); - } - - if (sampleContamination.containsKey(fields[0])) { - throw new UserException.MalformedFile("Contamination file contains duplicate entries for input name " + fields[0]); - } - - try { - final Double contamination = Double.valueOf(fields[1]); - if (contamination < 0 || contamination > 1){ - throw new UserException.MalformedFile("Contamination file contains unacceptable contamination value (must be 0<=x<=1): " + line); - } - if (AvailableSampleIDs==null || AvailableSampleIDs.contains(fields[0])) {// only add samples if they are in the sampleSet (or if it is null) - sampleContamination.put(fields[0], contamination); - } - else { - nonSamplesInContaminationFile.add(fields[0]); - } - } catch (NumberFormatException e) { - throw new UserException.MalformedFile("Contamination file contains unparsable double in the second field. Offending line: " + line); - } - } - - - //output to the user info lines telling which samples are in the Contamination File - if (sampleContamination.size() > 0) { - logger.info(String.format("The following samples were found in the Contamination file and will be processed at the contamination level therein: %s", sampleContamination.keySet().toString())); - - //output to the user info lines telling which samples are NOT in the Contamination File - if(AvailableSampleIDs!=null){ - Set samplesNotInContaminationFile = new HashSet(AvailableSampleIDs); - samplesNotInContaminationFile.removeAll(sampleContamination.keySet()); - if (samplesNotInContaminationFile.size() > 0) - logger.info(String.format("The following samples were NOT found in the Contamination file and will be processed at the default contamination level: %s", samplesNotInContaminationFile.toString())); - } - } - - //output to the user Samples that do not have lines in the Contamination File - if (nonSamplesInContaminationFile.size() > 0) { - logger.info(String.format("The following entries were found in the Contamination file but were not SAMPLEIDs. They will be ignored: %s", nonSamplesInContaminationFile.toString())); - } - - return sampleContamination; - - } catch (IOException e) { - throw new GATKException("I/O Error while reading sample-contamination file " + ContaminationFractionFile.getName() + ": " + e.getMessage()); - } - - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsampleType.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsampleType.java deleted file mode 100644 index 715ef6eed..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsampleType.java +++ /dev/null @@ -1,39 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -/** - * Type of downsampling method to invoke. - * - * @author hanna - * @version 0.1 - */ - -public enum DownsampleType { - NONE, - ALL_READS, - BY_SAMPLE -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/Downsampler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/Downsampler.java deleted file mode 100644 index 8ab0198b1..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/Downsampler.java +++ /dev/null @@ -1,161 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import java.util.Collection; -import java.util.List; - -/** - * The basic downsampler API, with no reads-specific operations. - * - * Downsamplers that extend this class rather than the ReadsDownsampler class can handle - * any kind of item, however they cannot be wrapped within a DownsamplingReadsIterator or a - * PerSampleDownsamplingReadsIterator. - * - * @author David Roazen - */ -public abstract class Downsampler { - - /** - * Number of items discarded by this downsampler since the last call to resetStats() - */ - protected int numDiscardedItems = 0; - - /** - * Submit one item to the downsampler for consideration. Some downsamplers will be able to determine - * immediately whether the item survives the downsampling process, while others will need to see - * more items before making that determination. - * - * @param item the individual item to submit to the downsampler for consideration - */ - public abstract void submit( final T item ); - - /** - * Submit a collection of items to the downsampler for consideration. Should be equivalent to calling - * submit() on each individual item in the collection. - * - * @param items the collection of items to submit to the downsampler for consideration - */ - public void submit( final Collection items ) { - if ( items == null ) { - throw new IllegalArgumentException("submitted items must not be null"); - } - - for ( final T item : items ) { - submit(item); - } - } - - /** - * Are there items that have survived the downsampling process waiting to be retrieved? - * - * @return true if this downsampler has > 0 finalized items, otherwise false - */ - public abstract boolean hasFinalizedItems(); - - /** - * Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved. - * - * @return a list of all finalized items this downsampler contains, or an empty list if there are none - */ - public abstract List consumeFinalizedItems(); - - /** - * Are there items stored in this downsampler that it doesn't yet know whether they will - * ultimately survive the downsampling process? - * - * @return true if this downsampler has > 0 pending items, otherwise false - */ - public abstract boolean hasPendingItems(); - - /** - * Peek at the first finalized item stored in this downsampler (or null if there are no finalized items) - * - * @return the first finalized item in this downsampler (the item is not removed from the downsampler by this call), - * or null if there are none - */ - public abstract T peekFinalized(); - - /** - * Peek at the first pending item stored in this downsampler (or null if there are no pending items) - * - * @return the first pending item stored in this downsampler (the item is not removed from the downsampler by this call), - * or null if there are none - */ - public abstract T peekPending(); - - /** - * Get the current number of items in this downsampler - * - * This should be the best estimate of the total number of elements that will come out of the downsampler - * were consumeFinalizedItems() to be called immediately after this call. In other words it should - * be number of finalized items + estimate of number of pending items that will ultimately be included as well. - * - * @return a positive integer - */ - public abstract int size(); - - /** - * Returns the number of items discarded (so far) during the downsampling process - * - * @return the number of items that have been submitted to this downsampler and discarded in the process of - * downsampling - */ - public int getNumberOfDiscardedItems() { - return numDiscardedItems; - } - - /** - * Used to tell the downsampler that no more items will be submitted to it, and that it should - * finalize any pending items. - */ - public abstract void signalEndOfInput(); - - /** - * Empty the downsampler of all finalized/pending items - */ - public abstract void clearItems(); - - /** - * Reset stats in the downsampler such as the number of discarded items *without* clearing the downsampler of items - */ - public void resetStats() { - numDiscardedItems = 0; - } - - /** - * Indicates whether an item should be excluded from elimination during downsampling. By default, - * all items representing reduced reads are excluded from downsampling, but individual downsamplers - * may override if they are able to handle reduced reads correctly. Downsamplers should check - * the return value of this method before discarding an item. - * - * @param item The item to test - * @return true if the item should not be subject to elimination during downsampling, otherwise false - */ - protected boolean doNotDiscardItem( final Object item ) { - return false; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingMethod.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingMethod.java deleted file mode 100644 index 94a3cc74b..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingMethod.java +++ /dev/null @@ -1,142 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import org.broadinstitute.gatk.engine.walkers.ActiveRegionWalker; -import org.broadinstitute.gatk.engine.walkers.LocusWalker; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -/** - * Describes the method for downsampling reads at a given locus. - */ - -public class DownsamplingMethod { - /** - * Type of downsampling to perform. - */ - public final DownsampleType type; - - /** - * Actual downsampling target is specified as an integer number of reads. - */ - public final Integer toCoverage; - - /** - * Actual downsampling target is specified as a fraction of total available reads. - */ - public final Double toFraction; - - /** - * Expresses no downsampling applied at all. - */ - public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE, null, null); - - /** - * Default type to use if no type is specified - */ - public static final DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE; - - /** - * Don't allow dcov values below this threshold for locus-based traversals (ie., Locus - * and ActiveRegion walkers), as they can result in problematic downsampling artifacts - */ - public static final int MINIMUM_SAFE_COVERAGE_TARGET_FOR_LOCUS_BASED_TRAVERSALS = 200; - - - public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction ) { - this.type = type != null ? type : DEFAULT_DOWNSAMPLING_TYPE; - - if ( type == DownsampleType.NONE ) { - this.toCoverage = null; - this.toFraction = null; - } - else { - this.toCoverage = toCoverage; - this.toFraction = toFraction; - } - - validate(); - } - - private void validate() { - // Can't leave toFraction and toCoverage null unless type is NONE - if ( type != DownsampleType.NONE && toFraction == null && toCoverage == null ) - throw new UserException("Must specify either toFraction or toCoverage when downsampling."); - - // Fraction and coverage cannot both be specified. - if ( toFraction != null && toCoverage != null ) - throw new UserException("Downsampling coverage and fraction are both specified. Please choose only one."); - - // toCoverage must be > 0 when specified - if ( toCoverage != null && toCoverage <= 0 ) { - throw new UserException("toCoverage must be > 0 when downsampling to coverage"); - } - - // toFraction must be >= 0.0 and <= 1.0 when specified - if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) { - throw new UserException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads"); - } - } - - public void checkCompatibilityWithWalker( Walker walker ) { - boolean isLocusTraversal = walker instanceof LocusWalker || walker instanceof ActiveRegionWalker; - - if ( isLocusTraversal && type == DownsampleType.ALL_READS && toCoverage != null ) { - throw new UserException("Downsampling to coverage with the ALL_READS method for locus-based traversals (eg., LocusWalkers) is not currently supported (though it is supported for ReadWalkers)."); - } - - // For locus traversals, ensure that the dcov value (if present) is not problematically low - if ( isLocusTraversal && type != DownsampleType.NONE && toCoverage != null && - toCoverage < MINIMUM_SAFE_COVERAGE_TARGET_FOR_LOCUS_BASED_TRAVERSALS ) { - throw new UserException(String.format("Locus-based traversals (ie., Locus and ActiveRegion walkers) require " + - "a minimum -dcov value of %d when downsampling to coverage. Values less " + - "than this can produce problematic downsampling artifacts while providing " + - "only insignificant improvements in memory usage in most cases.", - MINIMUM_SAFE_COVERAGE_TARGET_FOR_LOCUS_BASED_TRAVERSALS)); - } - } - - public String toString() { - StringBuilder builder = new StringBuilder("Downsampling Settings: "); - - if ( type == DownsampleType.NONE ) { - builder.append("No downsampling"); - } - else { - builder.append(String.format("Method: %s, ", type)); - - if ( toCoverage != null ) { - builder.append(String.format("Target Coverage: %d", toCoverage)); - } - else { - builder.append(String.format("Target Fraction: %.2f", toFraction)); - } - } - - return builder.toString(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingReadsIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingReadsIterator.java deleted file mode 100644 index 6b398aba2..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingReadsIterator.java +++ /dev/null @@ -1,116 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; - -import java.util.Collection; -import java.util.Iterator; -import java.util.NoSuchElementException; - - -/** - * GATKSAMIterator wrapper around our generic reads downsampler interface. Converts the push-style - * downsampler interface to a pull model. - * - * @author David Roazen - */ -public class DownsamplingReadsIterator implements GATKSAMIterator { - - private GATKSAMIterator nestedSAMIterator; - private ReadsDownsampler downsampler; - private Collection downsampledReadsCache; - private SAMRecord nextRead = null; - private Iterator downsampledReadsCacheIterator = null; - - /** - * @param iter wrapped iterator from which this iterator will pull reads - * @param downsampler downsampler through which the reads will be fed - */ - public DownsamplingReadsIterator( GATKSAMIterator iter, ReadsDownsampler downsampler ) { - nestedSAMIterator = iter; - this.downsampler = downsampler; - - advanceToNextRead(); - } - - public boolean hasNext() { - return nextRead != null; - } - - public SAMRecord next() { - if ( nextRead == null ) { - throw new NoSuchElementException("next() called when there are no more items"); - } - - SAMRecord toReturn = nextRead; - advanceToNextRead(); - - return toReturn; - } - - private void advanceToNextRead() { - if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) { - nextRead = null; - } - else { - nextRead = downsampledReadsCacheIterator.next(); - } - } - - private boolean readyToReleaseReads() { - return downsampledReadsCacheIterator != null && downsampledReadsCacheIterator.hasNext(); - } - - private boolean fillDownsampledReadsCache() { - while ( nestedSAMIterator.hasNext() && ! downsampler.hasFinalizedItems() ) { - downsampler.submit(nestedSAMIterator.next()); - } - - if ( ! nestedSAMIterator.hasNext() ) { - downsampler.signalEndOfInput(); - } - - // use returned collection directly rather than make a copy, for speed - downsampledReadsCache = downsampler.consumeFinalizedItems(); - downsampledReadsCacheIterator = downsampledReadsCache.iterator(); - - return downsampledReadsCacheIterator.hasNext(); - } - - public void remove() { - throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); - } - - public void close() { - nestedSAMIterator.close(); - } - - public Iterator iterator() { - return this; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingUtils.java deleted file mode 100644 index bd236c0bc..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingUtils.java +++ /dev/null @@ -1,107 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; - -import java.util.*; - -/** - * Utilities for using the downsamplers for common tasks - * - * User: depristo - * Date: 3/6/13 - * Time: 4:26 PM - */ -public class DownsamplingUtils { - private DownsamplingUtils() { } - - /** - * Level the coverage of the reads in each sample to no more than downsampleTo reads, no reducing - * coverage at any read start to less than minReadsPerAlignmentStart - * - * This algorithm can be used to handle the situation where you have lots of coverage in some interval, and - * want to reduce the coverage of the big peak down without removing the many reads at the edge of this - * interval that are in fact good - * - * This algorithm separately operates on the reads for each sample independently. - * - * @param reads a sorted list of reads - * @param downsampleTo the targeted number of reads we want from reads per sample - * @param minReadsPerAlignmentStart don't reduce the number of reads starting at a specific alignment start - * to below this. That is, if this value is 2, we'll never reduce the number - * of reads starting at a specific start site to less than 2 - * @return a sorted list of reads - */ - public static List levelCoverageByPosition(final List reads, final int downsampleTo, final int minReadsPerAlignmentStart) { - if ( reads == null ) throw new IllegalArgumentException("reads must not be null"); - - final List downsampled = new ArrayList(reads.size()); - - final Map>> readsBySampleByStart = partitionReadsBySampleAndStart(reads); - for ( final Map> readsByPosMap : readsBySampleByStart.values() ) { - final LevelingDownsampler, GATKSAMRecord> downsampler = new LevelingDownsampler, GATKSAMRecord>(downsampleTo, minReadsPerAlignmentStart); - downsampler.submit(readsByPosMap.values()); - downsampler.signalEndOfInput(); - for ( final List downsampledReads : downsampler.consumeFinalizedItems()) - downsampled.addAll(downsampledReads); - } - - return ReadUtils.sortReadsByCoordinate(downsampled); - } - - /** - * Build the data structure mapping for each sample -> (position -> reads at position) - * - * Note that the map position -> reads isn't ordered in any meaningful way - * - * @param reads a list of sorted reads - * @return a map containing the list of reads at each start location, for each sample independently - */ - private static Map>> partitionReadsBySampleAndStart(final List reads) { - final Map>> readsBySampleByStart = new LinkedHashMap>>(); - - for ( final GATKSAMRecord read : reads ) { - Map> readsByStart = readsBySampleByStart.get(read.getReadGroup().getSample()); - - if ( readsByStart == null ) { - readsByStart = new LinkedHashMap>(); - readsBySampleByStart.put(read.getReadGroup().getSample(), readsByStart); - } - - List readsAtStart = readsByStart.get(read.getAlignmentStart()); - if ( readsAtStart == null ) { - readsAtStart = new LinkedList(); - readsByStart.put(read.getAlignmentStart(), readsAtStart); - } - - readsAtStart.add(read); - } - - return readsBySampleByStart; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsampler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsampler.java deleted file mode 100644 index a2d613c5f..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsampler.java +++ /dev/null @@ -1,129 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.ArrayList; -import java.util.List; - -/** - * Fractional Downsampler: selects a specified fraction of the reads for inclusion. - * - * Since the selection is done randomly, the actual fraction of reads retained may be slightly - * more or less than the requested fraction, depending on the total number of reads submitted. - * - * @author David Roazen - */ -public class FractionalDownsampler extends ReadsDownsampler { - - private ArrayList selectedReads; - - private final int cutoffForInclusion; - - private static final int RANDOM_POOL_SIZE = 10000; - - /** - * Construct a FractionalDownsampler - * - * @param fraction Fraction of reads to preserve, between 0.0 (inclusive) and 1.0 (inclusive). - * Actual number of reads preserved may differ randomly. - */ - public FractionalDownsampler( final double fraction ) { - if ( fraction < 0.0 || fraction > 1.0 ) { - throw new ReviewedGATKException("Fraction of reads to include must be between 0.0 and 1.0, inclusive"); - } - - cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE); - clearItems(); - resetStats(); - } - - @Override - public void submit( final T newRead ) { - if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion || doNotDiscardItem(newRead) ) { - selectedReads.add(newRead); - } - else { - numDiscardedItems++; - } - } - - @Override - public boolean hasFinalizedItems() { - return selectedReads.size() > 0; - } - - @Override - public List consumeFinalizedItems() { - // pass by reference rather than make a copy, for speed - List downsampledItems = selectedReads; - clearItems(); - return downsampledItems; - } - - @Override - public boolean hasPendingItems() { - return false; - } - - @Override - public T peekFinalized() { - return selectedReads.isEmpty() ? null : selectedReads.get(0); - } - - @Override - public T peekPending() { - return null; - } - - @Override - public int size() { - return selectedReads.size(); - } - - @Override - public void signalEndOfInput() { - // NO-OP - } - - @Override - public void clearItems() { - selectedReads = new ArrayList(); - } - - @Override - public boolean requiresCoordinateSortOrder() { - return false; - } - - @Override - public void signalNoMoreReadsBefore( final T read ) { - // NO-OP - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsamplerFactory.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsamplerFactory.java deleted file mode 100644 index 4ddf8dd87..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsamplerFactory.java +++ /dev/null @@ -1,46 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; - -/** - * Factory for creating FractionalDownsamplers on demand - * - * @author David Roazen - */ -public class FractionalDownsamplerFactory implements ReadsDownsamplerFactory { - - private double fraction; - - public FractionalDownsamplerFactory( double fraction ) { - this.fraction = fraction; - } - - public ReadsDownsampler newInstance() { - return new FractionalDownsampler(fraction); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/LevelingDownsampler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/LevelingDownsampler.java deleted file mode 100644 index 4ae7bc581..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/LevelingDownsampler.java +++ /dev/null @@ -1,242 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import org.broadinstitute.gatk.utils.MathUtils; - -import java.util.*; - -/** - * Leveling Downsampler: Given a set of Lists of arbitrary items and a target size, removes items from - * the Lists in an even fashion until the total size of all Lists is <= the target size. Leveling - * does not occur until all Lists have been submitted and signalEndOfInput() is called. - * - * The Lists should be LinkedLists for maximum efficiency during item removal, however other - * kinds of Lists are also accepted (albeit at a slight performance penalty). - * - * Since this downsampler extends the Downsampler interface rather than the ReadsDownsampler interface, - * the Lists need not contain reads. However this downsampler may not be wrapped within one of the - * DownsamplingReadsIterators - * - * @param the List type representing the stacks to be leveled - * @param the type of the elements of each List - * - * @author David Roazen - */ -public class LevelingDownsampler, E> extends Downsampler { - private final int minElementsPerStack; - - private final int targetSize; - - private List groups; - - private boolean groupsAreFinalized; - - /** - * Construct a LevelingDownsampler - * - * Uses the default minElementsPerStack of 1 - * - * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed - * this value -- if it does, items are removed from Lists evenly until the total size - * is <= this value - */ - public LevelingDownsampler( final int targetSize ) { - this(targetSize, 1); - } - - /** - * Construct a LevelingDownsampler - * - * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed - * this value -- if it does, items are removed from Lists evenly until the total size - * is <= this value - * @param minElementsPerStack no stack will be reduced below this size during downsampling. That is, - * if a stack has only 3 elements and minElementsPerStack is 3, no matter what - * we'll not reduce this stack below 3. - */ - public LevelingDownsampler( final int targetSize, final int minElementsPerStack ) { - if ( targetSize < 0 ) throw new IllegalArgumentException("targetSize must be >= 0 but got " + targetSize); - if ( minElementsPerStack < 0 ) throw new IllegalArgumentException("minElementsPerStack must be >= 0 but got " + minElementsPerStack); - - this.targetSize = targetSize; - this.minElementsPerStack = minElementsPerStack; - clearItems(); - resetStats(); - } - - @Override - public void submit( final T item ) { - groups.add(item); - } - - @Override - public void submit( final Collection items ){ - groups.addAll(items); - } - - @Override - public boolean hasFinalizedItems() { - return groupsAreFinalized && groups.size() > 0; - } - - @Override - public List consumeFinalizedItems() { - if ( ! hasFinalizedItems() ) { - return new ArrayList(); - } - - // pass by reference rather than make a copy, for speed - final List toReturn = groups; - clearItems(); - return toReturn; - } - - @Override - public boolean hasPendingItems() { - return ! groupsAreFinalized && groups.size() > 0; - } - - @Override - public T peekFinalized() { - return hasFinalizedItems() ? groups.get(0) : null; - } - - @Override - public T peekPending() { - return hasPendingItems() ? groups.get(0) : null; - } - - @Override - public int size() { - int s = 0; - for ( final List l : groups ) { - s += l.size(); - } - return s; - } - - @Override - public void signalEndOfInput() { - levelGroups(); - groupsAreFinalized = true; - } - - @Override - public void clearItems() { - groups = new ArrayList(); - groupsAreFinalized = false; - } - - private void levelGroups() { - final int[] groupSizes = new int[groups.size()]; - int totalSize = 0; - int currentGroupIndex = 0; - - for ( final T group : groups ) { - groupSizes[currentGroupIndex] = group.size(); - totalSize += groupSizes[currentGroupIndex]; - currentGroupIndex++; - } - - if ( totalSize <= targetSize ) { - return; // no need to eliminate any items - } - - // We will try to remove exactly this many items, however we will refuse to allow any - // one group to fall below size 1, and so might end up removing fewer items than this - int numItemsToRemove = totalSize - targetSize; - - currentGroupIndex = 0; - int numConsecutiveUmodifiableGroups = 0; - - // Continue until we've either removed all the items we wanted to, or we can't - // remove any more items without violating the constraint that all groups must - // be left with at least one item - while ( numItemsToRemove > 0 && numConsecutiveUmodifiableGroups < groupSizes.length ) { - if ( groupSizes[currentGroupIndex] > minElementsPerStack ) { - groupSizes[currentGroupIndex]--; - numItemsToRemove--; - numConsecutiveUmodifiableGroups = 0; - } - else { - numConsecutiveUmodifiableGroups++; - } - - currentGroupIndex = (currentGroupIndex + 1) % groupSizes.length; - } - - // Now we actually go through and reduce each group to its new count as specified in groupSizes - currentGroupIndex = 0; - for ( final T group : groups ) { - downsampleOneGroup(group, groupSizes[currentGroupIndex]); - currentGroupIndex++; - } - } - - private void downsampleOneGroup( final T group, final int numItemsToKeep ) { - if ( numItemsToKeep >= group.size() ) { - return; - } - - final BitSet itemsToKeep = new BitSet(group.size()); - for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(group.size(), numItemsToKeep) ) { - itemsToKeep.set(selectedIndex); - } - - int currentIndex = 0; - - // If our group is a linked list, we can remove the desired items in a single O(n) pass with an iterator - if ( group instanceof LinkedList ) { - final Iterator iter = group.iterator(); - while ( iter.hasNext() ) { - final E item = iter.next(); - - if ( ! itemsToKeep.get(currentIndex) && ! doNotDiscardItem(item) ) { - iter.remove(); - numDiscardedItems++; - } - - currentIndex++; - } - } - // If it's not a linked list, it's more efficient to copy the desired items into a new list and back rather - // than suffer O(n^2) of item shifting - else { - final List keptItems = new ArrayList(group.size()); - - for ( final E item : group ) { - if ( itemsToKeep.get(currentIndex) || doNotDiscardItem(item) ) { - keptItems.add(item); - } - currentIndex++; - } - numDiscardedItems += group.size() - keptItems.size(); - group.clear(); - group.addAll(keptItems); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/PassThroughDownsampler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/PassThroughDownsampler.java deleted file mode 100644 index a5fdf24a9..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/PassThroughDownsampler.java +++ /dev/null @@ -1,111 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; - -import java.util.LinkedList; -import java.util.List; - -/** - * Pass-Through Downsampler: Implementation of the ReadsDownsampler interface that does no - * downsampling whatsoever, and instead simply "passes-through" all the reads it's given. - * Useful for situations where you want to disable downsampling, but still need to use - * the downsampler interface. - * - * @author David Roazen - */ -public class PassThroughDownsampler extends ReadsDownsampler { - - private LinkedList selectedReads; - - public PassThroughDownsampler() { - clearItems(); - } - - @Override - public void submit( T newRead ) { - // All reads pass-through, no reads get downsampled - selectedReads.add(newRead); - } - - @Override - public boolean hasFinalizedItems() { - return ! selectedReads.isEmpty(); - } - - /** - * Note that this list is a linked list and so doesn't support fast random access - * @return - */ - @Override - public List consumeFinalizedItems() { - // pass by reference rather than make a copy, for speed - final List downsampledItems = selectedReads; - clearItems(); - return downsampledItems; - } - - @Override - public boolean hasPendingItems() { - return false; - } - - @Override - public T peekFinalized() { - return selectedReads.isEmpty() ? null : selectedReads.getFirst(); - } - - @Override - public T peekPending() { - return null; - } - - @Override - public int size() { - return selectedReads.size(); - } - - @Override - public void signalEndOfInput() { - // NO-OP - } - - @Override - public void clearItems() { - selectedReads = new LinkedList(); - } - - @Override - public boolean requiresCoordinateSortOrder() { - return false; - } - - @Override - public void signalNoMoreReadsBefore( T read ) { - // NO-OP - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/PerSampleDownsamplingReadsIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/PerSampleDownsamplingReadsIterator.java deleted file mode 100644 index 118bbbbeb..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/PerSampleDownsamplingReadsIterator.java +++ /dev/null @@ -1,207 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SAMRecordComparator; -import htsjdk.samtools.SAMRecordCoordinateComparator; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; - -import java.util.*; - - -/** - * GATKSAMIterator wrapper around our generic reads downsampler interface - * that downsamples reads for each sample independently, and then re-assembles - * the reads back into a single merged stream. - * - * @author David Roazen - */ -public class PerSampleDownsamplingReadsIterator implements GATKSAMIterator { - - private GATKSAMIterator nestedSAMIterator; - private ReadsDownsamplerFactory downsamplerFactory; - private Map> perSampleDownsamplers; - private PriorityQueue orderedDownsampledReadsCache; - private SAMRecord nextRead = null; - private SAMRecordComparator readComparator = new SAMRecordCoordinateComparator(); - private SAMRecord earliestPendingRead = null; - private ReadsDownsampler earliestPendingDownsampler = null; - - // Initial size of our cache of finalized reads - private static final int DOWNSAMPLED_READS_INITIAL_CACHE_SIZE = 4096; - - // The number of positional changes that can occur in the read stream before all downsamplers - // should be informed of the current position (guards against samples with relatively sparse reads - // getting stuck in a pending state): - private static final int DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL = 3; // TODO: experiment with this value - - /** - * @param iter wrapped iterator from which this iterator will pull reads - * @param downsamplerFactory factory used to create new downsamplers as needed - */ - public PerSampleDownsamplingReadsIterator( GATKSAMIterator iter, ReadsDownsamplerFactory downsamplerFactory ) { - nestedSAMIterator = iter; - this.downsamplerFactory = downsamplerFactory; - perSampleDownsamplers = new HashMap>(); - orderedDownsampledReadsCache = new PriorityQueue(DOWNSAMPLED_READS_INITIAL_CACHE_SIZE, readComparator); - - advanceToNextRead(); - } - - public boolean hasNext() { - return nextRead != null; - } - - public SAMRecord next() { - if ( nextRead == null ) { - throw new NoSuchElementException("next() called when there are no more items"); - } - - SAMRecord toReturn = nextRead; - advanceToNextRead(); - - return toReturn; - } - - private void advanceToNextRead() { - if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) { - nextRead = null; - } - else { - nextRead = orderedDownsampledReadsCache.poll(); - } - } - - private boolean readyToReleaseReads() { - if ( orderedDownsampledReadsCache.isEmpty() ) { - return false; - } - - return earliestPendingRead == null || - readComparator.compare(orderedDownsampledReadsCache.peek(), earliestPendingRead) <= 0; - } - - private boolean fillDownsampledReadsCache() { - SAMRecord prevRead = null; - int numPositionalChanges = 0; - - // Continue submitting reads to the per-sample downsamplers until the read at the top of the priority queue - // can be released without violating global sort order - while ( nestedSAMIterator.hasNext() && ! readyToReleaseReads() ) { - SAMRecord read = nestedSAMIterator.next(); - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - - ReadsDownsampler thisSampleDownsampler = perSampleDownsamplers.get(sampleName); - if ( thisSampleDownsampler == null ) { - thisSampleDownsampler = downsamplerFactory.newInstance(); - perSampleDownsamplers.put(sampleName, thisSampleDownsampler); - } - - thisSampleDownsampler.submit(read); - processFinalizedAndPendingItems(thisSampleDownsampler); - - if ( prevRead != null && prevRead.getAlignmentStart() != read.getAlignmentStart() ) { - numPositionalChanges++; - } - - // Periodically inform all downsamplers of the current position in the read stream. This is - // to prevent downsamplers for samples with sparser reads than others from getting stuck too - // long in a pending state. - if ( numPositionalChanges > 0 && numPositionalChanges % DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL == 0 ) { - for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { - perSampleDownsampler.signalNoMoreReadsBefore(read); - processFinalizedAndPendingItems(perSampleDownsampler); - } - } - - prevRead = read; - } - - if ( ! nestedSAMIterator.hasNext() ) { - for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { - perSampleDownsampler.signalEndOfInput(); - if ( perSampleDownsampler.hasFinalizedItems() ) { - orderedDownsampledReadsCache.addAll(perSampleDownsampler.consumeFinalizedItems()); - } - } - earliestPendingRead = null; - earliestPendingDownsampler = null; - } - - return readyToReleaseReads(); - } - - private void updateEarliestPendingRead( ReadsDownsampler currentDownsampler ) { - // If there is no recorded earliest pending read and this downsampler has pending items, - // then this downsampler's first pending item becomes the new earliest pending read: - if ( earliestPendingRead == null && currentDownsampler.hasPendingItems() ) { - earliestPendingRead = currentDownsampler.peekPending(); - earliestPendingDownsampler = currentDownsampler; - } - // In all other cases, we only need to update the earliest pending read when the downsampler - // associated with it experiences a change in its pending reads, since by assuming a sorted - // read stream we're assured that each downsampler's earliest pending read will only increase - // in genomic position over time. - // - // TODO: An occasional O(samples) linear search seems like a better option than keeping the downsamplers - // TODO: sorted by earliest pending read, which would cost at least O(total_reads * (samples + log(samples))), - // TODO: but need to verify this empirically. - else if ( currentDownsampler == earliestPendingDownsampler && - (! currentDownsampler.hasPendingItems() || readComparator.compare(currentDownsampler.peekPending(), earliestPendingRead) != 0) ) { - - earliestPendingRead = null; - earliestPendingDownsampler = null; - for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { - if ( perSampleDownsampler.hasPendingItems() && - (earliestPendingRead == null || readComparator.compare(perSampleDownsampler.peekPending(), earliestPendingRead) < 0) ) { - - earliestPendingRead = perSampleDownsampler.peekPending(); - earliestPendingDownsampler = perSampleDownsampler; - } - } - } - } - - private void processFinalizedAndPendingItems( ReadsDownsampler currentDownsampler ) { - if ( currentDownsampler.hasFinalizedItems() ) { - orderedDownsampledReadsCache.addAll(currentDownsampler.consumeFinalizedItems()); - } - updateEarliestPendingRead(currentDownsampler); - } - - public void remove() { - throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); - } - - public void close() { - nestedSAMIterator.close(); - } - - public Iterator iterator() { - return this; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReadsDownsampler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReadsDownsampler.java deleted file mode 100644 index 9263920f9..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReadsDownsampler.java +++ /dev/null @@ -1,56 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; - -/** - * An extension of the basic downsampler API with reads-specific operations - * - * @author David Roazen - */ -public abstract class ReadsDownsampler extends Downsampler { - - /** - * Does this downsampler require that reads be fed to it in coordinate order? - * - * @return true if reads must be submitted to this downsampler in coordinate order, otherwise false - */ - public abstract boolean requiresCoordinateSortOrder(); - - /** - * Tell this downsampler that no more reads located before the provided read (according to - * the sort order of the read stream) will be fed to it. - * - * Allows position-aware downsamplers to finalize pending reads earlier than they would - * otherwise be able to, particularly when doing per-sample downsampling and reads for - * certain samples are sparser than average. - * - * @param read the downsampler will assume that no reads located before this read will ever - * be submitted to it in the future - */ - public abstract void signalNoMoreReadsBefore( final T read ); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReadsDownsamplerFactory.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReadsDownsamplerFactory.java deleted file mode 100644 index 9ef847e67..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReadsDownsamplerFactory.java +++ /dev/null @@ -1,38 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; - -/** - * A ReadsDownsamplerFactory can be used to create an arbitrary number of instances of a particular - * downsampler, all sharing the same construction parameters. - * - * @author David Roazen - */ -public interface ReadsDownsamplerFactory { - public ReadsDownsampler newInstance(); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsampler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsampler.java deleted file mode 100644 index 99a0bbd7a..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsampler.java +++ /dev/null @@ -1,219 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.*; - -/** - * Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with - * every read in the stream having an equal chance of being selected for inclusion. - * - * An implementation of "Algorithm R" from the paper "Random Sampling with a Reservoir" (Jeffrey Scott Vitter, 1985) - * - * @author David Roazen - */ -public class ReservoirDownsampler extends ReadsDownsampler { - - /** - * size of our reservoir -- ie., the maximum number of reads from the stream that will be retained - * (not including any undiscardable items) - */ - private final int targetSampleSize; - - /** - * if true, this downsampler will be optimized for the case - * where most of the time we won't fill up anything like the - * targetSampleSize elements. If this is false, we will allocate - * internal buffers to targetSampleSize initially, which minimizes - * the cost of allocation if we often use targetSampleSize or more - * elements. - */ - private final boolean expectFewOverflows; - - /** - * At times this can be a linked list or an array list, depending on how we're accessing the - * data and whether or not we're expecting few overflows - */ - private List reservoir; - - /** - * Certain items (eg., reduced reads) cannot be discarded at all during downsampling. We store - * these items separately so as not to impact the fair selection of items for inclusion in the - * reservoir. These items are returned (and cleared) along with any items in the reservoir in - * calls to consumeFinalizedItems(). - */ - private List undiscardableItems; - - /** - * Are we currently using a linked list for the reservoir? - */ - private boolean isLinkedList; - - /** - * Count of the number of reads seen that were actually eligible for discarding. Used by the reservoir downsampling - * algorithm to ensure that all discardable reads have an equal chance of making it into the reservoir. - */ - private int totalDiscardableReadsSeen; - - - /** - * Construct a ReservoirDownsampler - * - * @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained - * after downsampling will be min(totalDiscardableReads, targetSampleSize) + any - * undiscardable reads (eg., reduced reads). - * - * @param expectFewOverflows if true, this downsampler will be optimized for the case - * where most of the time we won't fill up anything like the - * targetSampleSize elements. If this is false, we will allocate - * internal buffers to targetSampleSize initially, which minimizes - * the cost of allocation if we often use targetSampleSize or more - * elements. - */ - public ReservoirDownsampler ( final int targetSampleSize, final boolean expectFewOverflows ) { - if ( targetSampleSize <= 0 ) { - throw new ReviewedGATKException("Cannot do reservoir downsampling with a sample size <= 0"); - } - - this.targetSampleSize = targetSampleSize; - this.expectFewOverflows = expectFewOverflows; - clearItems(); - resetStats(); - } - - /** - * Construct a ReservoirDownsampler - * - * @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained - * after downsampling will be min(totalReads, targetSampleSize) - */ - public ReservoirDownsampler ( final int targetSampleSize ) { - this(targetSampleSize, false); - } - - @Override - public void submit ( final T newRead ) { - if ( doNotDiscardItem(newRead) ) { - undiscardableItems.add(newRead); - return; - } - - // Only count reads that are actually eligible for discarding for the purposes of the reservoir downsampling algorithm - totalDiscardableReadsSeen++; - - if ( totalDiscardableReadsSeen <= targetSampleSize ) { - reservoir.add(newRead); - } - else { - if ( isLinkedList ) { - reservoir = new ArrayList(reservoir); - isLinkedList = false; - } - - final int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalDiscardableReadsSeen); - if ( randomSlot < targetSampleSize ) { - reservoir.set(randomSlot, newRead); - } - numDiscardedItems++; - } - } - - @Override - public boolean hasFinalizedItems() { - return ! reservoir.isEmpty() || ! undiscardableItems.isEmpty(); - } - - @Override - public List consumeFinalizedItems() { - if ( ! hasFinalizedItems() ) { - // if there's nothing here, don't bother allocating a new list - return Collections.emptyList(); - } else { - // pass reservoir by reference rather than make a copy, for speed - final List downsampledItems = reservoir; - downsampledItems.addAll(undiscardableItems); - clearItems(); - return downsampledItems; - } - } - - @Override - public boolean hasPendingItems() { - return false; - } - - @Override - public T peekFinalized() { - return ! reservoir.isEmpty() ? reservoir.get(0) : (! undiscardableItems.isEmpty() ? undiscardableItems.get(0) : null); - } - - @Override - public T peekPending() { - return null; - } - - @Override - public int size() { - return reservoir.size() + undiscardableItems.size(); - } - - @Override - public void signalEndOfInput() { - // NO-OP - } - - /** - * Clear the data structures used to hold information - */ - @Override - public void clearItems() { - // if we aren't expecting many overflows, allocate a linked list not an arraylist - reservoir = expectFewOverflows ? new LinkedList() : new ArrayList(targetSampleSize); - - // there's no possibility of overflow with the undiscardable items, so we always use a linked list for them - undiscardableItems = new LinkedList<>(); - - // it's a linked list if we allocate one - isLinkedList = expectFewOverflows; - - // an internal stat used by the downsampling process, so not cleared by resetStats() below - totalDiscardableReadsSeen = 0; - } - - @Override - public boolean requiresCoordinateSortOrder() { - return false; - } - - @Override - public void signalNoMoreReadsBefore( T read ) { - // NO-OP - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsamplerFactory.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsamplerFactory.java deleted file mode 100644 index c825bae1f..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsamplerFactory.java +++ /dev/null @@ -1,46 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; - -/** - * Factory for creating ReservoirDownsamplers on demand - * - * @author David Roazen - */ -public class ReservoirDownsamplerFactory implements ReadsDownsamplerFactory { - - private int targetSampleSize; - - public ReservoirDownsamplerFactory( int targetSampleSize ) { - this.targetSampleSize = targetSampleSize; - } - - public ReadsDownsampler newInstance() { - return new ReservoirDownsampler(targetSampleSize); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsampler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsampler.java deleted file mode 100644 index af0aa54c0..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsampler.java +++ /dev/null @@ -1,171 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; - -import java.util.*; - -/** - * Simple Positional Downsampler: Downsample each stack of reads at each alignment start to a size <= a target coverage - * using a Reservoir downsampler. Stores only O(target coverage) reads in memory at any given time. - * - * @author David Roazen - */ -public class SimplePositionalDownsampler extends ReadsDownsampler { - - private final int targetCoverage; - - private final ReservoirDownsampler reservoir; - - private int currentContigIndex; - - private int currentAlignmentStart; - - private boolean positionEstablished; - - private boolean unmappedReadsReached; - - private ArrayList finalizedReads; - - - /** - * Construct a SimplePositionalDownsampler - * - * @param targetCoverage Maximum number of reads that may share any given alignment start position - */ - public SimplePositionalDownsampler( final int targetCoverage ) { - this.targetCoverage = targetCoverage; - reservoir = new ReservoirDownsampler(targetCoverage); - finalizedReads = new ArrayList(); - clearItems(); - resetStats(); - } - - @Override - public void submit( final T newRead ) { - updatePositionalState(newRead); - - if ( unmappedReadsReached ) { // don't downsample the unmapped reads at the end of the stream - finalizedReads.add(newRead); - } - else { - final int reservoirPreviouslyDiscardedItems = reservoir.getNumberOfDiscardedItems(); - // our reservoir downsampler will call doNotDiscardItem() for us to exclude items from elimination as appropriate - reservoir.submit(newRead); - numDiscardedItems += reservoir.getNumberOfDiscardedItems() - reservoirPreviouslyDiscardedItems; - } - } - - @Override - public boolean hasFinalizedItems() { - return finalizedReads.size() > 0; - } - - @Override - public List consumeFinalizedItems() { - // pass by reference rather than make a copy, for speed - final List toReturn = finalizedReads; - finalizedReads = new ArrayList(); - return toReturn; - } - - @Override - public boolean hasPendingItems() { - return reservoir.hasFinalizedItems(); - } - - @Override - public T peekFinalized() { - return finalizedReads.isEmpty() ? null : finalizedReads.get(0); - } - - @Override - public T peekPending() { - return reservoir.peekFinalized(); - } - - @Override - public int size() { - return finalizedReads.size() + reservoir.size(); - } - - @Override - public void signalEndOfInput() { - finalizeReservoir(); - } - - @Override - public void clearItems() { - reservoir.clearItems(); - reservoir.resetStats(); - finalizedReads.clear(); - positionEstablished = false; - unmappedReadsReached = false; - } - - @Override - public boolean requiresCoordinateSortOrder() { - return true; - } - - @Override - public void signalNoMoreReadsBefore( final T read ) { - updatePositionalState(read); - } - - private void updatePositionalState( final T newRead ) { - if ( readIsPastCurrentPosition(newRead) ) { - if ( reservoir.hasFinalizedItems() ) { - finalizeReservoir(); - } - - setCurrentPosition(newRead); - - if ( newRead.getReadUnmappedFlag() ) { - unmappedReadsReached = true; - } - } - } - - private void setCurrentPosition( final T read ) { - currentContigIndex = read.getReferenceIndex(); - currentAlignmentStart = read.getAlignmentStart(); - positionEstablished = true; - } - - private boolean readIsPastCurrentPosition( final T read ) { - return ! positionEstablished || - read.getReferenceIndex() > currentContigIndex || - read.getAlignmentStart() > currentAlignmentStart || - (read.getReadUnmappedFlag() && ! unmappedReadsReached); - } - - private void finalizeReservoir() { - finalizedReads.addAll(reservoir.consumeFinalizedItems()); - reservoir.resetStats(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsamplerFactory.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsamplerFactory.java deleted file mode 100644 index 3fc66cafe..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsamplerFactory.java +++ /dev/null @@ -1,46 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; - -/** - * Factory for creating SimplePositionalDownsamplers on demand - * - * @author David Roazen - */ -public class SimplePositionalDownsamplerFactory implements ReadsDownsamplerFactory { - - private int targetCoverage; - - public SimplePositionalDownsamplerFactory( int targetCoverage ) { - this.targetCoverage = targetCoverage; - } - - public ReadsDownsampler newInstance() { - return new SimplePositionalDownsampler(targetCoverage); - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java deleted file mode 100644 index 293bb1ce5..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java +++ /dev/null @@ -1,130 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.executive; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.datasources.providers.LocusShardDataProvider; -import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.gatk.engine.datasources.providers.ShardDataProvider; -import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; -import org.broadinstitute.gatk.engine.datasources.reads.Shard; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.io.DirectOutputTracker; -import org.broadinstitute.gatk.engine.io.OutputTracker; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.engine.traversals.TraversalEngine; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.SampleUtils; -import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor; - -import java.util.Collection; - - -/** A micro-scheduling manager for single-threaded execution of a traversal. */ -public class LinearMicroScheduler extends MicroScheduler { - - /** - * A direct output tracker for directly managing output. - */ - private DirectOutputTracker outputTracker = new DirectOutputTracker(); - - /** - * Create a new linear microscheduler to process the given reads and reference. - * - * @param walker Walker for the traversal. - * @param reads Reads file(s) to process. - * @param reference Reference for driving the traversal. - * @param rods Reference-ordered data. - */ - protected LinearMicroScheduler(final GenomeAnalysisEngine engine, - final Walker walker, - final SAMDataSource reads, - final IndexedFastaSequenceFile reference, - final Collection rods, - final ThreadAllocation threadAllocation) { - super(engine, walker, reads, reference, rods, threadAllocation); - - if ( threadAllocation.monitorThreadEfficiency() ) - setThreadEfficiencyMonitor(new ThreadEfficiencyMonitor()); - } - - /** - * Run this traversal over the specified subsection of the dataset. - * - * @param walker Computation to perform over dataset. - * @param shardStrategy A strategy for sharding the data. - */ - public Object execute(Walker walker, Iterable shardStrategy) { - super.startingExecution(); - walker.initialize(); - Accumulator accumulator = Accumulator.create(engine,walker); - - boolean done = walker.isDone(); - int counter = 0; - - final TraversalEngine traversalEngine = borrowTraversalEngine(this); - for (Shard shard : shardStrategy ) { - if ( abortExecution() || done || shard == null ) // we ran out of shards that aren't owned - break; - - if(shard.getShardType() == Shard.ShardType.LOCUS) { - WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), - getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine)); - for(WindowMaker.WindowMakerIterator iterator: windowMaker) { - ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),engine.getGenomeLocParser(),iterator.getLocus(),iterator,reference,rods); - Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); - accumulator.accumulate(dataProvider,result); - dataProvider.close(); - if ( walker.isDone() ) break; - } - windowMaker.close(); - } - else { - ShardDataProvider dataProvider = new ReadShardDataProvider(shard,engine.getGenomeLocParser(),getReadIterator(shard),reference,rods); - Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); - accumulator.accumulate(dataProvider,result); - dataProvider.close(); - } - - done = walker.isDone(); - } - - Object result = accumulator.finishTraversal(); - - outputTracker.close(); - returnTraversalEngine(this, traversalEngine); - cleanup(); - executionIsDone(); - - return accumulator; - } - - /** - * @{inheritDoc} - */ - public OutputTracker getOutputTracker() { return outputTracker; } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java deleted file mode 100644 index e192b9a72..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java +++ /dev/null @@ -1,463 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.executive; - -import com.google.java.contract.Ensures; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.ReadMetrics; -import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; -import org.broadinstitute.gatk.engine.datasources.reads.Shard; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.io.OutputTracker; -import org.broadinstitute.gatk.engine.iterators.NullSAMIterator; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.engine.traversals.*; -import org.broadinstitute.gatk.engine.walkers.*; -import org.broadinstitute.gatk.utils.AutoFormattingTime; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.progressmeter.ProgressMeter; -import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor; - -import javax.management.JMException; -import javax.management.MBeanServer; -import javax.management.ObjectName; -import java.io.File; -import java.lang.management.ManagementFactory; -import java.util.*; - - -/** - * Created by IntelliJ IDEA. - * User: mhanna - * Date: Apr 26, 2009 - * Time: 12:37:23 PM - * - * General base class for all scheduling algorithms - * Shards and schedules data in manageable chunks. - * - * Creates N TraversalEngines for each data thread for the MicroScheduler. This is necessary - * because in the HMS case you have multiple threads executing a traversal engine independently, and - * these engines may need to create separate resources for efficiency or implementation reasons. For example, - * the nanoScheduler creates threads to implement the traversal, and this creation is instance specific. - * So each HMS thread needs to have it's own distinct copy of the traversal engine if it wants to have - * N data threads x M nano threads => N * M threads total. These are borrowed from this microscheduler - * and returned when done. Also allows us to tracks all created traversal engines so this microscheduler - * can properly shut them all down when the scheduling is done. - * - */ -public abstract class MicroScheduler implements MicroSchedulerMBean { - protected static final Logger logger = Logger.getLogger(MicroScheduler.class); - - /** - * The list of all Traversal engines we've created in this micro scheduler - */ - final List allCreatedTraversalEngines = new LinkedList(); - - /** - * All available engines. Engines are borrowed and returned when a subclass is actually - * going to execute the engine on some data. This allows us to have N copies for - * N data parallel executions, but without the dangerous code of having local - * ThreadLocal variables. - */ - final LinkedList availableTraversalEngines = new LinkedList(); - - /** - * Engines that have been allocated to a key already. - */ - final HashMap allocatedTraversalEngines = new HashMap(); - - /** - * Counts the number of instances of the class that are currently alive. - */ - private static int instanceNumber = 0; - - /** - * The engine invoking this scheduler. - */ - protected final GenomeAnalysisEngine engine; - - protected final IndexedFastaSequenceFile reference; - - private final SAMDataSource reads; - protected final Collection rods; - - private final MBeanServer mBeanServer; - private final ObjectName mBeanName; - - /** - * Threading efficiency monitor for tracking the resource utilization of the GATK - * - * may be null - */ - ThreadEfficiencyMonitor threadEfficiencyMonitor = null; - - /** - * MicroScheduler factory function. Create a microscheduler appropriate for reducing the - * selected walker. - * - * @param walker Which walker to use. - * @param reads the informations associated with the reads - * @param reference the reference file - * @param rods the rods to include in the traversal - * @param threadAllocation Number of threads to utilize. - * - * @return The best-fit microscheduler. - */ - public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { - if ( threadAllocation.isRunningInParallelMode() ) { - logger.info(String.format("Running the GATK in parallel mode with %d total threads, " + - "%d CPU thread(s) for each of %d data thread(s), of %d processors available on this machine", - threadAllocation.getTotalNumThreads(), - threadAllocation.getNumCPUThreadsPerDataThread(), - threadAllocation.getNumDataThreads(), - Runtime.getRuntime().availableProcessors())); - if ( threadAllocation.getTotalNumThreads() > Runtime.getRuntime().availableProcessors() ) - logger.warn(String.format("Number of requested GATK threads %d is more than the number of " + - "available processors on this machine %d", threadAllocation.getTotalNumThreads(), - Runtime.getRuntime().availableProcessors())); - } - - if ( threadAllocation.getNumDataThreads() > 1 ) { - if (walker.isReduceByInterval()) - throw new UserException.BadArgumentValue("nt", String.format("This run of %s is set up to aggregate results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option or check if this tool has an option to disable per-interval calculations.", engine.getWalkerName(walker.getClass()))); - - if ( ! (walker instanceof TreeReducible) ) { - throw badNT("nt", engine, walker); - } - } - - if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof NanoSchedulable) ) { - throw badNT("nct", engine, walker); - } - - if ( threadAllocation.getNumDataThreads() > 1 ) { - return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); - } else { - return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); - } - } - - private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) { - throw new UserException.BadArgumentValue(parallelArg, - String.format("The analysis %s currently does not support parallel execution with %s. " + - "Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg)); - } - - /** - * Create a microscheduler given the reads and reference. - * - * @param walker the walker to execute with - * @param reads The reads. - * @param reference The reference. - * @param rods the rods to include in the traversal - * @param threadAllocation the allocation of threads to use in the underlying traversal - */ - protected MicroScheduler(final GenomeAnalysisEngine engine, - final Walker walker, - final SAMDataSource reads, - final IndexedFastaSequenceFile reference, - final Collection rods, - final ThreadAllocation threadAllocation) { - this.engine = engine; - this.reads = reads; - this.reference = reference; - this.rods = rods; - - final File progressLogFile = engine.getArguments() == null ? null : engine.getArguments().performanceLog; - - // Creates uninitialized TraversalEngines appropriate for walker and threadAllocation, - // and adds it to the list of created engines for later shutdown. - for ( int i = 0; i < threadAllocation.getNumDataThreads(); i++ ) { - final TraversalEngine traversalEngine = createTraversalEngine(walker, threadAllocation); - allCreatedTraversalEngines.add(traversalEngine); - availableTraversalEngines.add(traversalEngine); - } - - // Create the progress meter, and register it with the analysis engine - engine.registerProgressMeter(new ProgressMeter(progressLogFile, - availableTraversalEngines.peek().getTraversalUnits(), - engine.getRegionsOfGenomeBeingProcessed())); - - // Now that we have a progress meter, go through and initialize the traversal engines - for ( final TraversalEngine traversalEngine : allCreatedTraversalEngines ) - traversalEngine.initialize(engine, walker, engine.getProgressMeter()); - - // JMX does not allow multiple instances with the same ObjectName to be registered with the same platform MXBean. - // To get around this limitation and since we have no job identifier at this point, register a simple counter that - // will count the number of instances of this object that have been created in this JVM. - int thisInstance = instanceNumber++; - mBeanServer = ManagementFactory.getPlatformMBeanServer(); - try { - mBeanName = new ObjectName("org.broadinstitute.gatk.engine.executive:type=MicroScheduler,instanceNumber="+thisInstance); - mBeanServer.registerMBean(this, mBeanName); - } - catch (JMException ex) { - throw new ReviewedGATKException("Unable to register microscheduler with JMX", ex); - } - } - - /** - * Really make us a traversal engine of the appropriate type for walker and thread allocation - * - * @return a non-null uninitialized traversal engine - */ - @Ensures("result != null") - private TraversalEngine createTraversalEngine(final Walker walker, final ThreadAllocation threadAllocation) { - if (walker instanceof ReadWalker) { - return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()); - } else if (walker instanceof LocusWalker) { - return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()); - } else if (walker instanceof DuplicateWalker) { - return new TraverseDuplicates(); - } else if (walker instanceof ReadPairWalker) { - return new TraverseReadPairs(); - } else if (walker instanceof ActiveRegionWalker) { - return new TraverseActiveRegions(threadAllocation.getNumCPUThreadsPerDataThread()); - } else { - throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); - } - } - - - /** - * Return the ThreadEfficiencyMonitor we are using to track our resource utilization, if there is one - * - * @return the monitor, or null if none is active - */ - public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { - return threadEfficiencyMonitor; - } - - /** - * Inform this Microscheduler to use the efficiency monitor used to create threads in subclasses - * - * @param threadEfficiencyMonitor - */ - public void setThreadEfficiencyMonitor(final ThreadEfficiencyMonitor threadEfficiencyMonitor) { - this.threadEfficiencyMonitor = threadEfficiencyMonitor; - } - - /** - * Should we stop all execution work and exit gracefully? - * - * Returns true in the case where some external signal or time limit has been received, indicating - * that this GATK shouldn't continue executing. This isn't a kill signal, it is really a "shutdown - * gracefully at the next opportunity" signal. Concrete implementations of the MicroScheduler - * examine this value as often as reasonable and, if it returns true, stop what they are doing - * at the next available opportunity, shutdown their resources, call notify done, and return. - * - * @return true if we should abort execution, or false otherwise - */ - protected boolean abortExecution() { - final boolean abort = engine.exceedsRuntimeLimit(); - if ( abort ) { - final AutoFormattingTime aft = new AutoFormattingTime(engine.getRuntimeLimitInNanoseconds(), -1, 4); - logger.info("Aborting execution (cleanly) because the runtime has exceeded the requested maximum " + aft); - } - return abort; - } - - /** - * Walks a walker over the given list of intervals. - * - * @param walker Computation to perform over dataset. - * @param shardStrategy A strategy for sharding the data. - * - * @return the return type of the walker - */ - public abstract Object execute(Walker walker, Iterable shardStrategy); - - /** - * Tells this MicroScheduler that the execution of one of the subclass of this object as started - * - * Must be called when the implementation of execute actually starts up - * - * Currently only starts the progress meter timer running, but other start up activities could be incorporated - */ - protected void startingExecution() { - engine.getProgressMeter().start(); - } - - /** - * Retrieves the object responsible for tracking and managing output. - * @return An output tracker, for loading data in and extracting results. Will not be null. - */ - public abstract OutputTracker getOutputTracker(); - - /** - * Gets the an iterator over the given reads, which will iterate over the reads in the given shard. - * @param shard the shard to use when querying reads. - * @return an iterator over the reads specified in the shard. - */ - protected GATKSAMIterator getReadIterator(Shard shard) { - return (!reads.isEmpty()) ? reads.seek(shard) : new NullSAMIterator(); - } - - /** - * Must be called by subclasses when execute is done - */ - protected void executionIsDone() { - engine.getProgressMeter().notifyDone(engine.getCumulativeMetrics().getNumIterations()); - printReadFilteringStats(); - shutdownTraversalEngines(); - - // Print out the threading efficiency of this HMS, if state monitoring is enabled - if ( threadEfficiencyMonitor != null ) { - // include the master thread information - threadEfficiencyMonitor.threadIsDone(Thread.currentThread()); - threadEfficiencyMonitor.printUsageInformation(logger); - } - } - - /** - * Shutdown all of the created engines, and clear the list of created engines, dropping - * pointers to the traversal engines - */ - public synchronized void shutdownTraversalEngines() { - for ( final TraversalEngine te : allCreatedTraversalEngines) - te.shutdown(); - - allCreatedTraversalEngines.clear(); - availableTraversalEngines.clear(); - } - - /** - * Prints out information about number of reads observed and filtering, if any reads were used in the traversal - * - * Looks like: - * - * INFO 10:40:47,370 MicroScheduler - 22 reads were filtered out during traversal out of 101 total (21.78%) - * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing BadMateFilter - * INFO 10:40:47,370 MicroScheduler - -> 20 reads (19.80% of total) failing DuplicateReadFilter - * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing FailsVendorQualityCheckFilter - */ - private void printReadFilteringStats() { - final ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics(); - if ( cumulativeMetrics.getNumReadsSeen() > 0 ) { - // count up the number of skipped reads by summing over all filters - long nSkippedReads = 0L; - for ( final long countsByFilter : cumulativeMetrics.getCountsByFilter().values()) - nSkippedReads += countsByFilter; - - logger.info(String.format("%d reads were filtered out during the traversal out of approximately %d total reads (%.2f%%)", - nSkippedReads, - cumulativeMetrics.getNumReadsSeen(), - 100.0 * MathUtils.ratio(nSkippedReads, cumulativeMetrics.getNumReadsSeen()))); - - for ( final Map.Entry filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) { - long count = filterCounts.getValue(); - logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s", - count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), filterCounts.getKey())); - } - } - } - - /** - * Gets the engine that created this microscheduler. - * @return The engine owning this microscheduler. - */ - public GenomeAnalysisEngine getEngine() { return engine; } - - /** - * Returns data source maintained by this scheduler - * @return - */ - public SAMDataSource getSAMDataSource() { return reads; } - - /** - * Returns the reference maintained by this scheduler. - * @return The reference maintained by this scheduler. - */ - public IndexedFastaSequenceFile getReference() { return reference; } - - protected void cleanup() { - try { - mBeanServer.unregisterMBean(mBeanName); - } - catch (JMException ex) { - throw new ReviewedGATKException("Unable to unregister microscheduler with JMX", ex); - } - } - - /** - * Returns a traversal engine suitable for use, associated with key - * - * Key is an arbitrary object that is used to retrieve the same traversal - * engine over and over. This can be important in the case where the - * traversal engine has data associated with it in some other context, - * and we need to ensure that the context always sees the same traversal - * engine. This happens in the HierarchicalMicroScheduler, where you want - * the a thread executing traversals to retrieve the same engine each time, - * as outputs are tracked w.r.t. that engine. - * - * If no engine is associated with key yet, pops the next available engine - * from the available ones maintained by this - * microscheduler. Note that it's a runtime error to pop a traversal engine - * from this scheduler if there are none available. Callers that - * once pop'd an engine for use must return it with returnTraversalEngine - * - * @param key the key to associate with this engine - * @return a non-null TraversalEngine suitable for execution in this scheduler - */ - @Ensures("result != null") - protected synchronized TraversalEngine borrowTraversalEngine(final Object key) { - if ( key == null ) throw new IllegalArgumentException("key cannot be null"); - - final TraversalEngine engine = allocatedTraversalEngines.get(key); - if ( engine == null ) { - if ( availableTraversalEngines.isEmpty() ) - throw new IllegalStateException("no traversal engines were available"); - allocatedTraversalEngines.put(key, availableTraversalEngines.pop()); - return allocatedTraversalEngines.get(key); - } else { - return engine; - } - } - - /** - * Return a borrowed traversal engine to this MicroScheduler, for later use - * in another traversal execution - * - * @param key the key used to id the engine, provided to the borrowTraversalEngine function - * @param traversalEngine the borrowed traversal engine. Must have been previously borrowed. - */ - protected synchronized void returnTraversalEngine(final Object key, final TraversalEngine traversalEngine) { - if ( traversalEngine == null ) - throw new IllegalArgumentException("Attempting to push a null traversal engine"); - if ( ! allCreatedTraversalEngines.contains(traversalEngine) ) - throw new IllegalArgumentException("Attempting to push a traversal engine not created by this MicroScheduler" + engine); - if ( ! allocatedTraversalEngines.containsKey(key) ) - throw new IllegalArgumentException("No traversal engine was never checked out with key " + key); - - // note there's nothing to actually do here, but a function implementation - // might want to do something - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/WindowMaker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/WindowMaker.java deleted file mode 100644 index c8483298b..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/WindowMaker.java +++ /dev/null @@ -1,217 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.executive; - -import htsjdk.samtools.util.PeekableIterator; -import org.broadinstitute.gatk.engine.ReadProperties; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.datasources.reads.Shard; -import org.broadinstitute.gatk.engine.iterators.GATKSAMRecordIterator; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.locusiterator.LocusIterator; -import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.Collection; -import java.util.Iterator; -import java.util.List; -import java.util.NoSuchElementException; - -/** - * Transforms an iterator of reads which overlap the given interval list into an iterator of covered single-base loci - * completely contained within the interval list. To do this, it creates a LocusIteratorByState which will emit a single-bp - * locus for every base covered by the read iterator, then uses the WindowMakerIterator.advance() to filter down that stream of - * loci to only those covered by the given interval list. - * - * Example: - * Incoming stream of reads: A:chr20:1-5, B:chr20:2-6, C:chr20:2-7, D:chr20:3-8, E:chr20:5-10 - * Incoming intervals: chr20:3-7 - * - * Locus iterator by state will produce the following stream of data: - * chr1:1 {A}, chr1:2 {A,B,C}, chr1:3 {A,B,C,D}, chr1:4 {A,B,C,D}, chr1:5 {A,B,C,D,E}, - * chr1:6 {B,C,D,E}, chr1:7 {C,D,E}, chr1:8 {D,E}, chr1:9 {E}, chr1:10 {E} - * - * WindowMakerIterator will then filter the incoming stream, emitting the following stream: - * chr1:3 {A,B,C,D}, chr1:4 {A,B,C,D}, chr1:5 {A,B,C,D,E}, chr1:6 {B,C,D,E}, chr1:7 {C,D,E} - * - * @author mhanna - * @version 0.1 - */ -public class WindowMaker implements Iterable, Iterator { - /** - * Source information for iteration. - */ - private final ReadProperties sourceInfo; - - /** - * Hold the read iterator so that it can be closed later. - */ - private final GATKSAMRecordIterator readIterator; - - /** - * The data source for reads. Will probably come directly from the BAM file. - */ - private final PeekableIterator sourceIterator; - - /** - * Stores the sequence of intervals that the windowmaker should be tracking. - */ - private final PeekableIterator intervalIterator; - - /** - * In the case of monolithic sharding, this case returns whether the only shard has been generated. - */ - private boolean shardGenerated = false; - - /** - * The alignment context to return from this shard's iterator. Lazy implementation: the iterator will not find the - * currentAlignmentContext until absolutely required to do so. If currentAlignmentContext is null and advance() - * doesn't populate it, no more elements are available. If currentAlignmentContext is non-null, currentAlignmentContext - * should be returned by next(). - */ - private AlignmentContext currentAlignmentContext; - - /** - * Create a new window maker with the given iterator as a data source, covering - * the given intervals. - * @param iterator The data source for this window. - * @param intervals The set of intervals over which to traverse. - * @param sampleNames The complete set of sample names in the reads in shard - */ - - private final LocusIteratorByState libs; - - public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, GATKSAMIterator iterator, List intervals, Collection sampleNames) { - this.sourceInfo = shard.getReadProperties(); - this.readIterator = new GATKSAMRecordIterator(iterator); - - this.libs = new LocusIteratorByState(readIterator,sourceInfo,genomeLocParser,sampleNames); - this.sourceIterator = new PeekableIterator(libs); - - this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null; - } - - public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, GATKSAMIterator iterator, List intervals ) { - this(shard, genomeLocParser, iterator, intervals, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); - } - - public Iterator iterator() { - return this; - } - - public boolean hasNext() { - return (intervalIterator != null && intervalIterator.hasNext()) || !shardGenerated; - } - - public WindowMakerIterator next() { - shardGenerated = true; - return new WindowMakerIterator(intervalIterator != null ? intervalIterator.next() : null); - } - - public void remove() { - throw new UnsupportedOperationException("Cannot remove from a window maker."); - } - - public void close() { - this.readIterator.close(); - } - - public class WindowMakerIterator extends LocusIterator { - /** - * The locus for which this iterator is currently returning reads. - */ - private final GenomeLoc locus; - - public WindowMakerIterator(GenomeLoc locus) { - this.locus = locus; - advance(); - } - - public ReadProperties getSourceInfo() { - return sourceInfo; - } - - public GenomeLoc getLocus() { - return locus; - } - - public WindowMakerIterator iterator() { - return this; - } - - public boolean hasNext() { - advance(); - return currentAlignmentContext != null; - } - - public AlignmentContext next() { - if(!hasNext()) throw new NoSuchElementException("WindowMakerIterator is out of elements for this interval."); - - // Consume this alignment context. - AlignmentContext toReturn = currentAlignmentContext; - currentAlignmentContext = null; - - // Return the current element. - return toReturn; - } - - private void advance() { - // Need to find the next element that is not past shard boundaries. If we travel past the edge of - // shard boundaries, stop and let the next interval pick it up. - while(currentAlignmentContext == null && sourceIterator.hasNext()) { - // Advance the iterator and try again. - AlignmentContext candidateAlignmentContext = sourceIterator.peek(); - - if(locus == null) { - // No filter present. Return everything that LocusIteratorByState provides us. - currentAlignmentContext = sourceIterator.next(); - } - else if(locus.isPast(candidateAlignmentContext.getLocation())) - // Found a locus before the current window; claim this alignment context and throw it away. - sourceIterator.next(); - else if(locus.containsP(candidateAlignmentContext.getLocation())) { - // Found a locus within the current window; claim this alignment context and call it the next entry. - currentAlignmentContext = sourceIterator.next(); - } - else if(locus.isBefore(candidateAlignmentContext.getLocation())) { - // Whoops. Skipped passed the end of the region. Iteration for this window is complete. Do - // not claim this alignment context in case it is part of the next shard. - break; - } - else - throw new ReviewedGATKException("BUG: filtering locus does not contain, is not before, and is not past the given alignment context"); - } - } - - @Override - public LocusIteratorByState getLIBS() { - return libs; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java deleted file mode 100644 index fce3a714d..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java +++ /dev/null @@ -1,122 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.Cigar; -import htsjdk.samtools.CigarElement; -import htsjdk.samtools.CigarOperator; -import htsjdk.samtools.SAMRecord; - -import java.util.Iterator; - -/** - * Filter out reads with wonky cigar strings. - * - * - No reads with Hard/Soft clips in the middle of the cigar - * - No reads starting with deletions (with or without preceding clips) - * - No reads ending in deletions (with or without follow-up clips) - * - No reads that are fully hard or soft clipped - * - No reads that have consecutive indels in the cigar (II, DD, ID or DI) - * - * ps: apparently an empty cigar is okay... - * - * @author ebanks - * @version 0.1 - */ - -public class BadCigarFilter extends ReadFilter { - - public boolean filterOut(final SAMRecord rec) { - final Cigar c = rec.getCigar(); - - // if there is no Cigar then it can't be bad - if( c.isEmpty() ) { - return false; - } - - Iterator elementIterator = c.getCigarElements().iterator(); - - CigarOperator firstOp = CigarOperator.H; - while (elementIterator.hasNext() && (firstOp == CigarOperator.H || firstOp == CigarOperator.S)) { - CigarOperator op = elementIterator.next().getOperator(); - - // No reads with Hard/Soft clips in the middle of the cigar - if (firstOp != CigarOperator.H && op == CigarOperator.H) { - return true; - } - firstOp = op; - } - - // No reads starting with deletions (with or without preceding clips) - if (firstOp == CigarOperator.D) { - return true; - } - - boolean hasMeaningfulElements = (firstOp != CigarOperator.H && firstOp != CigarOperator.S); - boolean previousElementWasIndel = firstOp == CigarOperator.I; - CigarOperator lastOp = firstOp; - CigarOperator previousOp = firstOp; - - while (elementIterator.hasNext()) { - CigarOperator op = elementIterator.next().getOperator(); - - if (op != CigarOperator.S && op != CigarOperator.H) { - - // No reads with Hard/Soft clips in the middle of the cigar - if (previousOp == CigarOperator.S || previousOp == CigarOperator.H) - return true; - - lastOp = op; - - if (!hasMeaningfulElements && op.consumesReadBases()) { - hasMeaningfulElements = true; - } - - if (op == CigarOperator.I || op == CigarOperator.D) { - - // No reads that have consecutive indels in the cigar (II, DD, ID or DI) - if (previousElementWasIndel) { - return true; - } - previousElementWasIndel = true; - } - else { - previousElementWasIndel = false; - } - } - // No reads with Hard/Soft clips in the middle of the cigar - else if (op == CigarOperator.S && previousOp == CigarOperator.H) { - return true; - } - - previousOp = op; - } - - // No reads ending in deletions (with or without follow-up clips) - // No reads that are fully hard or soft clipped - return lastOp == CigarOperator.D || !hasMeaningfulElements; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java deleted file mode 100644 index c25d8d9ca..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java +++ /dev/null @@ -1,47 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; - -/** - * Filter out reads whose mate maps to a different contig. - * - * @author ebanks - * @version 0.1 - */ - -public class BadMateFilter extends ReadFilter { - - public boolean filterOut(final SAMRecord rec) { - return hasBadMate(rec); - } - - public static boolean hasBadMate(final SAMRecord rec) { - return (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && !rec.getReferenceIndex().equals(rec.getMateReferenceIndex())); - } - -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java deleted file mode 100644 index 52861e257..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java +++ /dev/null @@ -1,66 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Filter out duplicate reads. - * - * @author rpoplin - * @since Dec 9, 2009 - */ - -public class DuplicateReadFilter extends ReadFilter { - public boolean filterOut( final SAMRecord read ) { - return read.getDuplicateReadFlag(); - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java deleted file mode 100644 index 2cc5e2a8b..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java +++ /dev/null @@ -1,41 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; - -/** - * Filter out reads that fail the vendor quality check. - * - * @author rpoplin - * @since Jul 19, 2010 - */ - -public class FailsVendorQualityCheckFilter extends ReadFilter { - public boolean filterOut( final SAMRecord read ) { - return read.getReadFailsVendorQualityCheckFlag(); - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/FilterManager.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/FilterManager.java deleted file mode 100644 index 59c3f151b..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/FilterManager.java +++ /dev/null @@ -1,95 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import org.broadinstitute.gatk.utils.classloader.PluginManager; -import org.broadinstitute.gatk.utils.help.GATKDocUtils; -import org.broadinstitute.gatk.utils.help.HelpConstants; - -import java.util.Collection; -import java.util.List; - -/** - * Manage filters and filter options. Any requests for basic filtering classes - * should ultimately be made through this class. - * - * @author mhanna - * @version 0.1 - */ -public class FilterManager extends PluginManager { - public FilterManager() { - super(ReadFilter.class,"filter","Filter"); - } - - /** - * Instantiate a filter of the given type. Along the way, scream bloody murder if - * the filter is not available. - * @param filterType The type of the filter - * @return The filter - */ - public ReadFilter createFilterByType(Class filterType) { - return this.createByName(getName(filterType)); - } - - public Collection> getValues() { - return this.getPlugins(); - } - - /** - * Rather than use the default error message, print out a list of read filters as well. - * @param pluginCategory - string, the category of the plugin (e.g. read filter) - * @param pluginName - string, what we were trying to match (but failed to) - * @return - A wall of text with the default message, followed by a listing of available read filters - */ - @Override - protected String formatErrorMessage(String pluginCategory, String pluginName) { - List> availableFilters = this.getPluginsImplementing(ReadFilter.class); - - - return String.format("Read filter %s not found. Available read filters:%n%n%s%n%n%s",pluginName, - userFriendlyListofReadFilters(availableFilters), - "Please consult the GATK Documentation (" + HelpConstants.GATK_DOCS_URL + ") for more information."); - } - - private String userFriendlyListofReadFilters(List> filters) { - final String headName = "FilterName", headDoc = "Documentation"; - int longestNameLength = -1; - for ( Class < ? extends ReadFilter> filter : filters ) { - longestNameLength = Math.max(longestNameLength,this.getName(filter).length()); - } - String format = " %"+longestNameLength+"s %s%n"; - - StringBuilder listBuilder = new StringBuilder(); - listBuilder.append(String.format(format,headName,headDoc)); - for ( Class filter : filters ) { - String helpLink = GATKDocUtils.helpLinksToGATKDocs(filter); - String filterName = this.getName(filter); - listBuilder.append(String.format(format,filterName,helpLink)); - } - - return listBuilder.toString(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java deleted file mode 100644 index 8b0f07624..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java +++ /dev/null @@ -1,49 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMReadGroupRecord; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.commandline.Argument; -import org.broadinstitute.gatk.engine.filters.ReadFilter; - -/** - * Only use reads from the specified library - * - * @author kcibul - * @since Aug 15, 2012 - * - */ - -public class LibraryReadFilter extends ReadFilter { - @Argument(fullName = "library", shortName = "library", doc="The name of the library to keep, filtering out all others", required=true) - private String LIBRARY_TO_KEEP = null; - - public boolean filterOut( final SAMRecord read ) { - final SAMReadGroupRecord readGroup = read.getReadGroup(); - return ( readGroup == null || readGroup.getLibrary() == null || !readGroup.getLibrary().equals( LIBRARY_TO_KEEP ) ); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java deleted file mode 100644 index 1b59a06d8..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java +++ /dev/null @@ -1,260 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.*; -import org.broadinstitute.gatk.utils.commandline.Argument; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.ReadProperties; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.util.Collections; - -/** - * Filter out malformed reads. - * - * @author mhanna - * @version 0.1 - */ -public class MalformedReadFilter extends ReadFilter { - - - private static final String FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME = "filter_reads_with_N_cigar" ; - - private SAMFileHeader header; - - @Argument(fullName = FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME, shortName = "filterRNC", doc = "filter out reads with CIGAR containing the N operator, instead of stop processing and report an error.", required = false) - boolean filterReadsWithNCigar = false; - - - @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up.", required = false) - boolean filterMismatchingBaseAndQuals = false; - - @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "if a read has no stored bases (i.e. a '*'), filter out the read instead of blowing up.", required = false) - boolean filterBasesNotStored = false; - - /** - * Indicates the applicable validation exclusions - */ - private boolean allowNCigars; - - @Override - public void initialize(final GenomeAnalysisEngine engine) { - header = engine.getSAMFileHeader(); - ValidationExclusion validationExclusions = null; - final SAMDataSource rds = engine.getReadsDataSource(); - if (rds != null) { - final ReadProperties rps = rds.getReadsInfo(); - if (rps != null) { - validationExclusions = rps.getValidationExclusionList(); - } - } - if (validationExclusions == null) { - allowNCigars = false; - } else { - allowNCigars = validationExclusions.contains(ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS); - } - } - - public boolean filterOut(final SAMRecord read) { - // slowly changing the behavior to blow up first and filtering out if a parameter is explicitly provided - return !checkInvalidAlignmentStart(read) || - !checkInvalidAlignmentEnd(read) || - !checkAlignmentDisagreesWithHeader(this.header,read) || - !checkHasReadGroup(read) || - !checkMismatchingBasesAndQuals(read, filterMismatchingBaseAndQuals) || - !checkCigarDisagreesWithAlignment(read) || - !checkSeqStored(read, filterBasesNotStored) || - !checkCigarIsSupported(read,filterReadsWithNCigar,allowNCigars); - } - - private static boolean checkHasReadGroup(final SAMRecord read) { - if ( read.getReadGroup() == null ) { - // there are 2 possibilities: either the RG tag is missing or it is not defined in the header - final String rgID = (String)read.getAttribute(SAMTagUtil.getSingleton().RG); - if ( rgID == null ) - throw new UserException.ReadMissingReadGroup(read); - throw new UserException.ReadHasUndefinedReadGroup(read, rgID); - } - return true; - } - - /** - * Check for the case in which the alignment start is inconsistent with the read unmapped flag. - * @param read The read to validate. - * @return true if read start is valid, false otherwise. - */ - private static boolean checkInvalidAlignmentStart(final SAMRecord read ) { - // read is not flagged as 'unmapped', but alignment start is NO_ALIGNMENT_START - if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ) - return false; - // Read is not flagged as 'unmapped', but alignment start is -1 - if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == -1 ) - return false; - return true; - } - - /** - * Check for invalid end of alignments. - * @param read The read to validate. - * @return true if read end is valid, false otherwise. - */ - private static boolean checkInvalidAlignmentEnd(final SAMRecord read ) { - // Alignment aligns to negative number of bases in the reference. - if( !read.getReadUnmappedFlag() && read.getAlignmentEnd() != -1 && (read.getAlignmentEnd()-read.getAlignmentStart()+1)<0 ) - return false; - return true; - } - - /** - * Check to ensure that the alignment makes sense based on the contents of the header. - * @param header The SAM file header. - * @param read The read to verify. - * @return true if alignment agrees with header, false othrewise. - */ - private static boolean checkAlignmentDisagreesWithHeader(final SAMFileHeader header, final SAMRecord read ) { - // Read is aligned to nonexistent contig - if( read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) - return false; - final SAMSequenceRecord contigHeader = header.getSequence( read.getReferenceIndex() ); - // Read is aligned to a point after the end of the contig - if( !read.getReadUnmappedFlag() && read.getAlignmentStart() > contigHeader.getSequenceLength() ) - return false; - return true; - } - - /** - * Check for inconsistencies between the cigar string and the - * @param read The read to validate. - * @return true if cigar agrees with alignment, false otherwise. - */ - private static boolean checkCigarDisagreesWithAlignment(final SAMRecord read) { - // Read has a valid alignment start, but the CIGAR string is empty - if( !read.getReadUnmappedFlag() && - read.getAlignmentStart() != -1 && - read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START && - read.getAlignmentBlocks().size() < 0 ) - return false; - return true; - } - - /** - * Check for unsupported CIGAR operators. - * Currently the N operator is not supported. - * @param read The read to validate. - * @param filterReadsWithNCigar whether the offending read should just - * be silently filtered or not. - * @param allowNCigars whether reads that contain N operators in their CIGARs - * can be processed or an exception should be thrown instead. - * @throws UserException.UnsupportedCigarOperatorException - * if {@link #filterReadsWithNCigar} is false and - * the input read has some unsupported operation. - * @return true if the read CIGAR operations are - * fully supported, otherwise false, as long as - * no exception has been thrown. - */ - private static boolean checkCigarIsSupported(final SAMRecord read, final boolean filterReadsWithNCigar, final boolean allowNCigars) { - if( containsNOperator(read)) { - if (! filterReadsWithNCigar && !allowNCigars) { - throw new UserException.UnsupportedCigarOperatorException( - CigarOperator.N,read, - "Perhaps you are" - + " trying to use RNA-Seq data?" - + " While we are currently actively working to" - + " support this data type unfortunately the" - + " GATK cannot be used with this data in its" - + " current form. You have the option of either" - + " filtering out all reads with operator " - + CigarOperator.N + " in their CIGAR string" - + " (please add --" - + FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME - + " to your command line) or" - + " assume the risk of processing those reads as they" - + " are including the pertinent unsafe flag (please add -U" - + ' ' + ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS - + " to your command line). Notice however that if you were" - + " to choose the latter, an unspecified subset of the" - + " analytical outputs of an unspecified subset of the tools" - + " will become unpredictable. Consequently the GATK team" - + " might well not be able to provide you with the usual support" - + " with any issue regarding any output"); - } - return ! filterReadsWithNCigar; - } - return true; - } - - private static boolean containsNOperator(final SAMRecord read) { - final Cigar cigar = read.getCigar(); - if (cigar == null) { - return false; - } - for (final CigarElement ce : cigar.getCigarElements()) { - if (ce.getOperator() == CigarOperator.N) { - return true; - } - } - return false; - } - - /** - * Check if the read has the same number of bases and base qualities - * @param read the read to validate - * @return true if they have the same number. False otherwise. - */ - private static boolean checkMismatchingBasesAndQuals(final SAMRecord read, final boolean filterMismatchingBaseAndQuals) { - final boolean result; - if (read.getReadLength() == read.getBaseQualities().length) - result = true; - else if (filterMismatchingBaseAndQuals) - result = false; - else - throw new UserException.MalformedBAM(read, - String.format("BAM file has a read with mismatching number of bases and base qualities. Offender: %s [%d bases] [%d quals].%s", - read.getReadName(), read.getReadLength(), read.getBaseQualities().length, - read.getBaseQualities().length == 0 ? " You can use --defaultBaseQualities to assign a default base quality for all reads, but this can be dangerous in you don't know what you are doing." : "")); - - return result; - } - - /** - * Check if the read has its base sequence stored - * @param read the read to validate - * @return true if the sequence is stored and false otherwise ("*" in the SEQ field). - */ - protected static boolean checkSeqStored(final SAMRecord read, final boolean filterBasesNotStored) { - - if ( read.getReadBases() != SAMRecord.NULL_SEQUENCE ) - return true; - - if ( filterBasesNotStored ) - return false; - - throw new UserException.MalformedBAM(read, String.format("the BAM file has a read with no stored bases (i.e. it uses '*') which is not supported in the GATK; see the --filter_bases_not_stored argument. Offender: %s", read.getReadName())); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java deleted file mode 100644 index 67c62b975..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java +++ /dev/null @@ -1,46 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.commandline.Argument; - -/** - * Filter out reads with low mapping qualities. - * - * @author ebanks - * @version 0.1 - */ - -public class MappingQualityFilter extends ReadFilter { - - @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false) - public int MIN_MAPPING_QUALTY_SCORE = 10; - - public boolean filterOut(SAMRecord rec) { - return (rec.getMappingQuality() < MIN_MAPPING_QUALTY_SCORE); - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java deleted file mode 100644 index 05df7fb0d..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java +++ /dev/null @@ -1,43 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.QualityUtils; - -/** - * Filter out mapping quality zero reads. - * - * @author ebanks - * @version 0.1 - */ - -public class MappingQualityUnavailableFilter extends ReadFilter { - public boolean filterOut(SAMRecord rec) { - return (rec.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE); - } -} - diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java deleted file mode 100644 index f3f703278..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java +++ /dev/null @@ -1,42 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; - -/** - * Filter out mapping quality zero reads. - * - * @author hanna - * @version 0.1 - */ - -public class MappingQualityZeroFilter extends ReadFilter { - public boolean filterOut(SAMRecord rec) { - return (rec.getMappingQuality() == 0); - } -} - diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java deleted file mode 100644 index 0818f8fa0..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java +++ /dev/null @@ -1,42 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; - -/** - * Filter out reads that are not paired, have their mate unmapped, are duplicates, fail vendor quality check or both mate and read are in the same strand. - * - * @author chartl - * @since 5/18/11 - */ -public class MateSameStrandFilter extends ReadFilter { - - public boolean filterOut(SAMRecord read) { - return (! read.getReadPairedFlag() ) || read.getMateUnmappedFlag() || read.getDuplicateReadFlag() || - read.getReadFailsVendorQualityCheckFlag() || read.getMateNegativeStrandFlag() != read.getReadNegativeStrandFlag(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java deleted file mode 100644 index cca05ebc7..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java +++ /dev/null @@ -1,44 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.commandline.Argument; - -/** - * Filter out reads that exceed a given max insert size - * - * @author chartl - * @since 5/2/11 - */ -public class MaxInsertSizeFilter extends ReadFilter { - @Argument(fullName = "maxInsertSize", shortName = "maxInsert", doc="Discard reads with insert size greater than the specified value, defaults to 1000000", required=false) - private int maxInsertSize = 1000000; - - public boolean filterOut(SAMRecord record) { - return (record.getReadPairedFlag() && (record.getInferredInsertSize() > maxInsertSize || record.getInferredInsertSize() < -1*maxInsertSize)); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java deleted file mode 100644 index 21b291bb3..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java +++ /dev/null @@ -1,41 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; - -/** - * Filter out reads without read groups. - * - * @author ebanks - * @version 0.1 - */ - -public class MissingReadGroupFilter extends ReadFilter { - public boolean filterOut(SAMRecord rec) { - return rec.getReadGroup() == null; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java deleted file mode 100644 index 65bf1eb02..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java +++ /dev/null @@ -1,118 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.Cigar; -import htsjdk.samtools.CigarElement; -import htsjdk.samtools.CigarOperator; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.iterators.RNAReadTransformer; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -/** - * A read transformer that refactor NDN cigar elements to one N element. - * - *

    - * This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length of the three refactored elements). - * This is intended primarily for users of RNA-Seq data handling programs such as TopHat2. - * Currently we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of - * those specific reads, users of TopHat and other tools can circumvent this problem without affecting the rest of their dataset. - * - * NOTE: any walker that need that functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine. - * - * The engine parameter that activate this read transformer is --refactor_NDN_cigar_string or -fixNDN - *

    - * - * - * - * @author ami - * @since 04/22/14 - */ - -public class NDNCigarReadTransformer extends RNAReadTransformer { - - private boolean refactorReads; - - @Override - public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { - refactorReads = engine.getArguments().REFACTOR_NDN_CIGAR_READS; - - return ApplicationTime.HANDLED_IN_WALKER; // NOTE: any walker that need that functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine. - } - - @Override - public GATKSAMRecord apply(final GATKSAMRecord read) { - if(read == null) - throw new UserException.BadInput("try to transform a null GATKSAMRecord"); - final Cigar originalCigar = read.getCigar(); - if (originalCigar.isValid(read.getReadName(),-1) != null) - throw new UserException.BadInput("try to transform a read with non-valid cigar string: readName: "+read.getReadName()+" Cigar String: "+originalCigar); - read.setCigar(refactorNDNtoN(originalCigar)); - return read; - } - - @Override - public boolean enabled() { - return refactorReads; - } - - - - protected Cigar refactorNDNtoN(final Cigar originalCigar) { - final Cigar refactoredCigar = new Cigar(); - final int cigarLength = originalCigar.numCigarElements(); - for(int i = 0; i < cigarLength; i++){ - final CigarElement element = originalCigar.getCigarElement(i); - if(element.getOperator() == CigarOperator.N && thereAreAtLeast2MoreElements(i,cigarLength)){ - final CigarElement nextElement = originalCigar.getCigarElement(i+1); - final CigarElement nextNextElement = originalCigar.getCigarElement(i+2); - - // if it is N-D-N replace with N (with the total length) otherwise just add the first N. - if(nextElement.getOperator() == CigarOperator.D && nextNextElement.getOperator() == CigarOperator.N){ - final int threeElementsLength = element.getLength() + nextElement.getLength() + nextNextElement.getLength(); - final CigarElement refactoredElement = new CigarElement(threeElementsLength,CigarOperator.N); - refactoredCigar.add(refactoredElement); - i += 2; //skip the elements that were refactored - } - else - refactoredCigar.add(element); // add only the first N - } - else - refactoredCigar.add(element); // add any non-N element - } - return refactoredCigar; - } - - private boolean thereAreAtLeast2MoreElements(final int index, final int cigarLength){ - return index < cigarLength - 2; - } - -} - diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java deleted file mode 100644 index 8297903d8..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java +++ /dev/null @@ -1,65 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Filter out reads that don't have base an original quality quality score tag (usually added by BQSR) - * - * @author rpoplin - * @since Nov 19, 2009 - */ -public class NoOriginalQualityScoresFilter extends ReadFilter { - public boolean filterOut( final SAMRecord read ) { - return (read.getAttribute("OQ") == null); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java deleted file mode 100644 index b09e1f6d5..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java +++ /dev/null @@ -1,41 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; - -/** - * Filter out duplicate reads. - * - * @author rpoplin - * @since Dec 9, 2009 - */ - -public class NotPrimaryAlignmentFilter extends ReadFilter { - public boolean filterOut( final SAMRecord read ) { - return read.getNotPrimaryAlignmentFlag(); - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java deleted file mode 100644 index 79f16a5fc..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java +++ /dev/null @@ -1,43 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; - -/** - * Filter out 454 reads. - * - * @author ebanks - * @version 0.1 - */ - -public class Platform454Filter extends ReadFilter { - public boolean filterOut(SAMRecord rec) { - return (ReadUtils.is454Read((GATKSAMRecord)rec)); - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java deleted file mode 100644 index 8236cc219..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java +++ /dev/null @@ -1,49 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.commandline.Argument; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; - -/** - * Filter out PL matching reads. - * - * @author ebanks - * @version 0.1 - */ -public class PlatformFilter extends ReadFilter { - @Argument(fullName = "PLFilterName", shortName = "PLFilterName", doc="Discard reads with RG:PL attribute containing this string", required=false) - protected String[] PLFilterNames; - - public boolean filterOut(SAMRecord rec) { - for ( String name : PLFilterNames ) - if ( ReadUtils.isPlatformRead((GATKSAMRecord)rec, name.toUpperCase() )) - return true; - return false; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java deleted file mode 100644 index 4a6781ff5..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java +++ /dev/null @@ -1,86 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMReadGroupRecord; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.util.HashSet; -import java.util.Set; - -/** - * Filter out reads that have blacklisted platform unit tags. (See code documentation for how to create the blacklist). - * - * @author asivache - * @since Sep 21, 2009 - */ -public class PlatformUnitFilter extends ReadFilter { - // a hack: use static in order to be able to fill it with the data from command line at runtime - static private Set blackListedLanes = new HashSet(); - - public boolean filterOut(SAMRecord samRecord) { - - if ( blackListedLanes.size() == 0 ) return false; // no filters set, nothing to do - - Object pu_attr = samRecord.getAttribute("PU"); - - if ( pu_attr == null ) { - // no platform unit in the record, go get from read group - SAMReadGroupRecord rgr = samRecord.getReadGroup(); - if ( rgr == null ) throw new UserException.MalformedBAM(samRecord, "Read " + samRecord.getReadName() +" has NO associated read group record"); - pu_attr = rgr.getAttribute("PU") ; - } - if ( pu_attr == null ) return false; // could not get PU, forget about the filtering... - return blackListedLanes.contains((String)pu_attr); - } - - /** - * The argument is interpreted as a comma-separated list of lanes (platform units) to be filtered - * out. All the specified names will be registered with the filter and filterOut(r) for any SAMRecord r - * belonging to one of the specified lanes will thereafter return true. - * The names can be surrounded by additional spaces, the latters will be trimmed by this method. - * This method can be called multiple times to add more lanes. Re-registering the same lane again is safe. - * @param arg - */ - public static void setBlackListedLanes(String arg) { - String[] lanes = arg.split(","); - for ( int i = 0; i < lanes.length ; i++ ) { - blackListedLanes.add(lanes[i].trim()); - } - } - - /** - * Adds a single name of a lane (platform unit) to be filtered out by this filter. The name can be surrounded - * by spaces, the latters will be trimmed out. This method can be called multiple times to add more lanes. - * Re-registering the same lane again is safe. - * @param arg - */ - public static void addBlackListedLane(String arg) { - blackListedLanes.add(arg.trim()); - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java deleted file mode 100644 index 7c6bfb0e3..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java +++ /dev/null @@ -1,120 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMReadGroupRecord; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.text.XReadLines; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; -import java.util.Map.Entry; - -/** - * Removes records matching the read group tag and exact match string. - * For example, this filter value: - * PU:1000G-mpimg-080821-1_1 - * would filter out a read with the read group PU:1000G-mpimg-080821-1_1 - */ -public class ReadGroupBlackListFilter extends ReadFilter { - private Set>> filterEntries; - - public ReadGroupBlackListFilter(List blackLists) { - Map> filters = new TreeMap>(); - for (String blackList : blackLists) - addFilter(filters, blackList, null, 0); - this.filterEntries = filters.entrySet(); - } - - public boolean filterOut(SAMRecord samRecord) { - for (Entry> filterEntry : filterEntries) { - String attributeType = filterEntry.getKey(); - - SAMReadGroupRecord samReadGroupRecord = samRecord.getReadGroup(); - if (samReadGroupRecord != null) { - Object attribute; - if ("ID".equals(attributeType) || "RG".equals(attributeType)) - attribute = samReadGroupRecord.getId(); - else - attribute = samReadGroupRecord.getAttribute(attributeType); - if (attribute != null && filterEntry.getValue().contains(attribute)) - return true; - } - } - - return false; - } - - private void addFilter(Map> filters, String filter, File parentFile, int parentLineNum) { - if (filter.toLowerCase().endsWith(".list") || filter.toLowerCase().endsWith(".txt")) { - File file = new File(filter); - try { - int lineNum = 0; - XReadLines lines = new XReadLines(file); - for (String line : lines) { - lineNum++; - - if (line.trim().length() == 0) - continue; - - if (line.startsWith("#")) - continue; - - addFilter(filters, line, file, lineNum); - } - } catch (FileNotFoundException e) { - String message = "Error loading black list: " + file.getAbsolutePath(); - if (parentFile != null) { - message += ", " + parentFile.getAbsolutePath() + ":" + parentLineNum; - } - throw new UserException(message); - } - } else { - String[] filterEntry = filter.split(":", 2); - - String message = null; - if (filterEntry.length != 2) { - message = "Invalid read group filter: " + filter; - } else if (filterEntry[0].length() != 2) { - message = "Tag is not two characters: " + filter; - } - - if (message != null) { - if (parentFile != null) { - message += ", " + parentFile.getAbsolutePath() + ":" + parentLineNum; - } - message += ", format is :"; - throw new UserException(message); - } - - if (!filters.containsKey(filterEntry[0])) - filters.put(filterEntry[0], new TreeSet()); - filters.get(filterEntry[0]).add(filterEntry[1]); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java deleted file mode 100644 index 1e44df806..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java +++ /dev/null @@ -1,48 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.commandline.Argument; - -/** - * Filters out reads whose length is >= some value or < some value. - * - * @author mhanna - * @version 0.1 - */ -public class ReadLengthFilter extends ReadFilter { - @Argument(fullName = "maxReadLength", shortName = "maxRead", doc="Discard reads with length greater than the specified value", required=true) - private int maxReadLength; - - @Argument(fullName = "minReadLength", shortName = "minRead", doc="Discard reads with length shorter than the specified value", required=true) - private int minReadLength = 1; - public boolean filterOut(SAMRecord read) { - // check the length - return read.getReadLength() > maxReadLength || read.getReadLength() < minReadLength; - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java deleted file mode 100644 index 23a5151de..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java +++ /dev/null @@ -1,44 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.commandline.Argument; - -/** - * Filter out all reads except those with this read name - * - * @author chartl - * @since 9/19/11 - */ -public class ReadNameFilter extends ReadFilter { - @Argument(fullName = "readName", shortName = "rn", doc="Filter out all reads except those with this read name", required=true) - private String readName; - - public boolean filterOut(final SAMRecord rec) { - return ! rec.getReadName().equals(readName); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java deleted file mode 100644 index fd2876654..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java +++ /dev/null @@ -1,46 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.commandline.Argument; - -/** - * Filters out reads whose strand is negative or positive - * - * @author chartl - * @version 0.1 - */ -public class ReadStrandFilter extends ReadFilter { - @Argument(fullName = "filterPositive", shortName = "fp", doc="Discard reads on the forward strand",required=false) - boolean filterForward = false; - - public boolean filterOut(SAMRecord read) { - // check the length - return read.getReadNegativeStrandFlag() != filterForward; - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java deleted file mode 100644 index 0c8a93a83..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java +++ /dev/null @@ -1,86 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.commandline.Argument; - -/** - * A read filter (transformer) that sets all reads mapping quality to a given value. - * - *

    - * If a BAM file contains erroneous or missing mapping qualities (MAPQ), this read transformer will set all your - * mapping qualities to a given value (see arguments list for default value). - *

    - * - *

    See also

    - * - *

    ReassignOneMappingQualityFilter: reassigns a single MAPQ value, as opposed to all those found in the BAM file.

    - * - *

    Caveats

    - * - *

    Note that due to the order of operations involved in applying filters, it is possible that other read filters - * (determined either at command-line or internally by the tool you are using) will be applied to your data before - * this read transformation can be applied. If one of those other filters acts on the read mapping quality (MAPQ), - * then you may not obtain the expected results. Unfortunately it is currently not possible to change the order of - * operations from command line. To avoid the problem, we recommend applying this filter separately from any other - * analysis, using PrintReads.

    - * - * - *

    Input

    - *

    - * BAM file(s) - *

    - * - * - *

    Output

    - *

    - * BAM file(s) with all reads mapping qualities reassigned - *

    - * - *

    Examples

    - *
    - *  java -jar GenomeAnalysisTK.jar \
    - *      -T PrintReads \
    - *      -rf ReassignMappingQuality \
    - *      -DMQ 35
    - *  
    - * - * @author carneiro - * @since 8/8/11 - */ - -public class ReassignMappingQualityFilter extends ReadFilter { - - @Argument(fullName = "default_mapping_quality", shortName = "DMQ", doc = "Default read mapping quality to assign to all reads", required = false) - public int defaultMappingQuality = 60; - - public boolean filterOut(SAMRecord rec) { - rec.setMappingQuality(defaultMappingQuality); - return false; - } -} - diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java deleted file mode 100644 index f07f197c6..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java +++ /dev/null @@ -1,87 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.commandline.Argument; - -/** - * A read filter (transformer) that changes a given read mapping quality to a different value. - * - *

    - * This read transformer will change a certain read mapping quality to a different value without affecting reads that - * have other mapping qualities. This is intended primarily for users of RNA-Seq data handling programs such - * as TopHat, which use MAPQ = 255 to designate uniquely aligned reads. According to convention, 255 normally - * designates "unknown" quality, and most GATK tools automatically ignore such reads. By reassigning a different - * mapping quality to those specific reads, users of TopHat and other tools can circumvent this problem without - * affecting the rest of their dataset. - *

    - * - *

    - * This differs from the ReassignMappingQuality filter by its selectivity -- only one mapping quality is targeted. - * ReassignMappingQuality will change ALL mapping qualities to a single one, and is typically used for datasets - * that have no assigned mapping qualities. - *

    - * - *

    Input

    - *

    - * BAM file(s) - *

    - * - * - *

    Output

    - *

    - * BAM file(s) with one read mapping quality selectively reassigned as desired - *

    - * - *

    Examples

    - *
    - *    java -jar GenomeAnalysisTK.jar
    - *      -T PrintReads
    - *      -rf ReassignOneMappingQuality
    - *      -RMQF 255
    - *      -RMQT 60
    - *  
    - * - * @author vdauwera - * @since 2/19/13 - */ - -public class ReassignOneMappingQualityFilter extends ReadFilter { - - @Argument(fullName = "reassign_mapping_quality_from", shortName = "RMQF", doc = "Original mapping quality", required = false) - public int reassignMappingQualityFrom = 255; - - @Argument(fullName = "reassign_mapping_quality_to", shortName = "RMQT", doc = "Desired mapping quality", required = false) - public int reassignMappingQualityTo = 60; - - public boolean filterOut(SAMRecord rec) { - if (rec.getMappingQuality() == reassignMappingQualityFrom) - rec.setMappingQuality(reassignMappingQualityTo); - return false; - } -} - diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java deleted file mode 100644 index 2ec0112ab..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java +++ /dev/null @@ -1,45 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMReadGroupRecord; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.commandline.Argument; - -import java.util.Set; - -/** - * Filter out all reads except those with this sample - */ -public class SampleFilter extends ReadFilter { - @Argument(fullName = "sample_to_keep", shortName = "goodSM", doc="The name of the sample(s) to keep, filtering out all others", required=true) - private Set SAMPLES_TO_KEEP = null; - - public boolean filterOut( final SAMRecord read ) { - final SAMReadGroupRecord readGroup = read.getReadGroup(); - return !( readGroup != null && SAMPLES_TO_KEEP.contains(readGroup.getSample()) ); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java deleted file mode 100644 index 5a9d21476..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java +++ /dev/null @@ -1,48 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMReadGroupRecord; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.commandline.Argument; - -/** - * Only use reads from the specified read group. - * - * @author rpoplin - * @since Nov 27, 2009 - * - */ - -public class SingleReadGroupFilter extends ReadFilter { - @Argument(fullName = "read_group_to_keep", shortName = "goodRG", doc="The name of the read group to keep, filtering out all others", required=true) - private String READ_GROUP_TO_KEEP = null; - - public boolean filterOut( final SAMRecord read ) { - final SAMReadGroupRecord readGroup = read.getReadGroup(); - return !( readGroup != null && readGroup.getReadGroupId().equals( READ_GROUP_TO_KEEP ) ); - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java deleted file mode 100644 index e9cc30276..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java +++ /dev/null @@ -1,41 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.SAMRecord; - -/** - * Filter out unmapped reads. - * - * @author rpoplin - * @since Dec 9, 2009 - */ - -public class UnmappedReadFilter extends ReadFilter { - public boolean filterOut( final SAMRecord read ) { - return read.getReadUnmappedFlag() || read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/GATKSAMFileWriter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/GATKSAMFileWriter.java deleted file mode 100644 index c60aae842..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/GATKSAMFileWriter.java +++ /dev/null @@ -1,56 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.io; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMFileWriter; - -/** - * A writer that will allow unsorted BAM files to be written - * and sorted on-the-fly. - * - * @author mhanna - * @version 0.1 - */ -public interface GATKSAMFileWriter extends SAMFileWriter { - /** - * Writes the given custom header to SAM file output. - * @param header The header to write. - */ - public void writeHeader(SAMFileHeader header); - - /** - * Set Whether the BAM file to create is actually presorted. - * @param presorted True if the BAM file is presorted. False otherwise. - */ - public void setPresorted(boolean presorted); - - /** - * Set how many records in RAM the BAM file stores when sorting on-the-fly. - * @param maxRecordsInRam Max number of records in RAM. - */ - public void setMaxRecordsInRam(int maxRecordsInRam); -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/OutputTracker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/OutputTracker.java deleted file mode 100644 index 8f2fbe340..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/OutputTracker.java +++ /dev/null @@ -1,178 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.io; - -import htsjdk.samtools.SAMFileReader; -import htsjdk.samtools.ValidationStringency; -import org.broadinstitute.gatk.utils.commandline.ArgumentSource; -import org.broadinstitute.gatk.engine.io.storage.Storage; -import org.broadinstitute.gatk.engine.io.storage.StorageFactory; -import org.broadinstitute.gatk.engine.io.stubs.OutputStreamStub; -import org.broadinstitute.gatk.engine.io.stubs.Stub; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.classloader.JVMUtils; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.io.IOUtils; -import org.broadinstitute.gatk.utils.sam.SAMFileReaderBuilder; - -import java.io.File; -import java.io.OutputStream; -import java.lang.reflect.Field; -import java.util.HashMap; -import java.util.Map; - -/** - * Manages the output and err streams that are created specifically for walker - * output. - */ -public abstract class OutputTracker { - /** - * The streams to which walker users should be reading directly. - */ - protected Map inputs = new HashMap(); - - /** - * The streams to which walker users should be writing directly. - */ - protected Map outputs = new HashMap(); - - /** - * Special-purpose stub. Provides a connection to output streams. - */ - protected OutputStreamStub outStub = null; - - /** - * Special-purpose stream. Provides a connection to error streams. - */ - protected OutputStreamStub errStub = null; - - /** - * Gets the output storage associated with a given stub. - * @param stub The stub for which to find / create the right output stream. - * @param Type of the stream to create. - * @return Storage object with a facade of type T. - */ - public abstract T getStorage( Stub stub ); - - public void prepareWalker( Walker walker, ValidationStringency strictnessLevel ) { - for( Map.Entry io: inputs.entrySet() ) { - ArgumentSource targetField = io.getKey(); - Object targetValue = io.getValue(); - - // Ghastly hack: reaches in and finishes building out the SAMFileReader. - // TODO: Generalize this, and move it to its own initialization step. - if( targetValue instanceof SAMFileReaderBuilder) { - SAMFileReaderBuilder builder = (SAMFileReaderBuilder)targetValue; - builder.setValidationStringency(strictnessLevel); - targetValue = builder.build(); - } - - JVMUtils.setFieldValue( targetField.field, walker, targetValue ); - } - } - - /** - * Provide a mechanism for injecting supplemental streams for external management. - * @param argumentSource source Class / field into which to inject this stream. - * @param stub Stream to manage. - */ - public void addInput( ArgumentSource argumentSource, Object stub ) { - inputs.put(argumentSource,stub); - } - - /** - * Provide a mechanism for injecting supplemental streams for external management. - * @param stub Stream to manage. - */ - public void addOutput(Stub stub) { - addOutput(stub,null); - } - - /** - * Provide a mechanism for injecting supplemental streams for external management. - * @param stub Stream to manage. - */ - public void addOutput(Stub stub, Storage storage) { - stub.register(this); - outputs.put(stub,storage); - validateOutputPath(stub); - } - - /** - * Close down all existing output streams. - */ - public void close() { - for( Stub stub: outputs.keySet() ) { - // If the stream hasn't yet been created, create it so that there's at least an empty file present. - if( outputs.get(stub) == null ) - getTargetStream(stub); - - // Close down the storage. - outputs.get(stub).close(); - } - } - - /** - * Collects the target stream for this data. - * @param stub The stub for this stream. - * @param type of stub. - * @return An instantiated file into which data can be written. - */ - protected T getTargetStream( Stub stub ) { - if( !outputs.containsKey(stub) ) - throw new ReviewedGATKException("OutputTracker was not notified that this stub exists: " + stub); - Storage storage = outputs.get(stub); - if( storage == null ) { - storage = StorageFactory.createStorage(stub); - outputs.put(stub,storage); - } - return (T)storage; - } - - /** - * Ensures that the File associated with this stub (if any) is in a writable location - * @param stub - */ - protected void validateOutputPath(final Stub stub) { - if (stub.getOutputFile() != null && !(IOUtils.isSpecialFile(stub.getOutputFile()))) { - final File parentDir = stub.getOutputFile().getAbsoluteFile().getParentFile(); - if (! (parentDir.canWrite() && parentDir.canExecute())) - throw new UserException.CouldNotCreateOutputFile(stub.getOutputFile(), - "either the containing directory doesn't exist or it isn't writable"); - } - } - - /** - * Install an OutputStreamStub into the given fieldName of the given walker. - * @param walker Walker into which to inject the field name. - * @param fieldName Name of the field into which to inject the stub. - */ - private void installStub( Walker walker, String fieldName, OutputStream outputStream ) { - Field field = JVMUtils.findField( walker.getClass(), fieldName ); - JVMUtils.setFieldValue( field, walker, outputStream ); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/SAMFileWriterStorage.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/SAMFileWriterStorage.java deleted file mode 100644 index 3956e6e0b..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/SAMFileWriterStorage.java +++ /dev/null @@ -1,157 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.io.storage; - -import htsjdk.samtools.*; -import htsjdk.samtools.util.CloseableIterator; -import htsjdk.samtools.util.ProgressLoggerInterface; -import htsjdk.samtools.util.RuntimeIOException; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.io.stubs.SAMFileWriterStub; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.sam.SimplifyingSAMFileWriter; - -import java.io.File; -import java.lang.reflect.InvocationTargetException; -import java.lang.reflect.Method; - -/** - * Provides temporary storage for SAMFileWriters. - * - * @author mhanna - * @version 0.1 - */ -public class SAMFileWriterStorage implements SAMFileWriter, Storage { - private final File file; - private SAMFileWriter writer; - - private static Logger logger = Logger.getLogger(SAMFileWriterStorage.class); - - public SAMFileWriterStorage( SAMFileWriterStub stub ) { - this(stub,stub.getOutputFile()); - } - - public SAMFileWriterStorage( SAMFileWriterStub stub, File file ) { - this.file = file; - SAMFileWriterFactory factory = new SAMFileWriterFactory(); - // Enable automatic index creation for pre-sorted BAMs. - if (stub.getFileHeader().getSortOrder().equals(SAMFileHeader.SortOrder.coordinate) && stub.getIndexOnTheFly()) - factory.setCreateIndex(true); - if (stub.getGenerateMD5()) - factory.setCreateMd5File(true); - // Adjust max records in RAM. - // TODO -- this doesn't actually work because of a bug in Picard; do not use until fixed - if(stub.getMaxRecordsInRam() != null) - factory.setMaxRecordsInRam(stub.getMaxRecordsInRam()); - - if(stub.getOutputFile() != null) { - try { - this.writer = createBAMWriter(factory,stub.getFileHeader(),stub.isPresorted(),file,stub.getCompressionLevel()); - } - catch(RuntimeIOException ex) { - throw new UserException.CouldNotCreateOutputFile(file,"file could not be created",ex); - } - } - else if(stub.getOutputStream() != null){ - this.writer = factory.makeSAMWriter( stub.getFileHeader(), stub.isPresorted(), stub.getOutputStream()); - } - else - throw new UserException("Unable to write to SAM file; neither a target file nor a stream has been specified"); - - // if we want to send the BAM file through the simplifying writer, wrap it here - if ( stub.simplifyBAM() ) { - this.writer = new SimplifyingSAMFileWriter(this.writer); - } - } - - public SAMFileHeader getFileHeader() { - return writer.getFileHeader(); - } - - public void addAlignment( SAMRecord read ) { - writer.addAlignment(read); - } - - public void close() { - try { - writer.close(); - } catch (RuntimeIOException e) { - throw new UserException.ErrorWritingBamFile(e.getMessage()); - } - } - - public void mergeInto( SAMFileWriter targetStream ) { - SAMFileReader reader = new SAMFileReader( file ); - try { - CloseableIterator iterator = reader.iterator(); - while( iterator.hasNext() ) - targetStream.addAlignment( iterator.next() ); - iterator.close(); - } - finally { - reader.close(); - file.delete(); - } - } - - private SAMFileWriter createBAMWriter(final SAMFileWriterFactory factory, - final SAMFileHeader header, - final boolean presorted, - final File outputFile, - final Integer compressionLevel) { - SAMFileWriter writer; - if(compressionLevel != null) - writer = factory.makeBAMWriter(header, presorted, outputFile, compressionLevel); - else - writer = factory.makeBAMWriter(header, presorted, outputFile); - - // mhanna - 1 Mar 2011 - temporary hack until Picard generates an index file for empty BAMs -- - // - do a pre-initialization of the BAM file. - try { - Method prepareToWriteAlignmentsMethod = writer.getClass().getDeclaredMethod("prepareToWriteAlignments"); - if(prepareToWriteAlignmentsMethod != null) { - prepareToWriteAlignmentsMethod.setAccessible(true); - prepareToWriteAlignmentsMethod.invoke(writer); - } - } - catch(NoSuchMethodException ex) { - logger.info("Unable to call prepareToWriteAlignments method; this should be reviewed when Picard is updated."); - } - catch(IllegalAccessException ex) { - logger.info("Unable to access prepareToWriteAlignments method; this should be reviewed when Picard is updated."); - } - catch(InvocationTargetException ex) { - logger.info("Unable to invoke prepareToWriteAlignments method; this should be reviewed when Picard is updated."); - } - - return writer; - } - - @Override - public void setProgressLogger(final ProgressLoggerInterface logger) { - writer.setProgressLogger(logger); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/VariantContextWriterStorage.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/VariantContextWriterStorage.java deleted file mode 100644 index c4f776915..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/VariantContextWriterStorage.java +++ /dev/null @@ -1,228 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.io.storage; - -import htsjdk.samtools.util.BlockCompressedOutputStream; -import org.apache.log4j.Logger; -import htsjdk.tribble.AbstractFeatureReader; -import htsjdk.tribble.Feature; -import htsjdk.tribble.FeatureCodec; -import org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub; -import org.broadinstitute.gatk.engine.refdata.tracks.FeatureManager; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import htsjdk.variant.bcf2.BCF2Utils; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.writer.Options; -import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; -import htsjdk.variant.vcf.VCFHeader; - -import java.io.*; -import java.util.Arrays; -import java.util.EnumSet; -import java.util.List; - -/** - * Provides temporary and permanent storage for genotypes in VCF format. - * - * @author mhanna - * @version 0.1 - */ -public class VariantContextWriterStorage implements Storage, VariantContextWriter { - /** - * our log, which we want to capture anything from this class - */ - private static Logger logger = Logger.getLogger(VariantContextWriterStorage.class); - - private final static int BUFFER_SIZE = 1048576; - - protected final File file; - protected OutputStream stream; - protected final VariantContextWriter writer; - boolean closed = false; - - /** - * Constructs an object which will write directly into the output file provided by the stub. - * Intentionally delaying the writing of the header -- this should be filled in by the walker. - * - * Respecs the isCompressed() request in stub, so if isCompressed() is true then this - * will create a storage output that dumps output to a BlockCompressedOutputStream. - * - * @param stub Stub to use when constructing the output file. - */ - public VariantContextWriterStorage(VariantContextWriterStub stub) { - if ( stub.getOutputFile() != null ) { - this.file = stub.getOutputFile(); - writer = vcfWriterToFile(stub,stub.getOutputFile(),true,true); - } - else if ( stub.getOutputStream() != null ) { - this.file = null; - this.stream = stub.getOutputStream(); - writer = VariantContextWriterFactory.create(stream, - stub.getMasterSequenceDictionary(), stub.getWriterOptions(false)); - } - else - throw new ReviewedGATKException("Unable to create target to which to write; storage was provided with neither a file nor a stream."); - } - - /** - * Constructs an object which will redirect into a different file. - * - * Note that this function does not respect the isCompressed() request from the stub, in order - * to ensure that tmp. files can be read back in by the Tribble system, and merged with the mergeInto function. - * - * @param stub Stub to use when synthesizing file / header info. - * @param tempFile File into which to direct the output data. - */ - public VariantContextWriterStorage(VariantContextWriterStub stub, File tempFile) { - //logger.debug("Creating temporary output file " + tempFile.getAbsolutePath() + " for VariantContext output."); - this.file = tempFile; - this.writer = vcfWriterToFile(stub, file, false, false); - writer.writeHeader(stub.getVCFHeader()); - } - - /** - * common initialization routine for multiple constructors - * @param stub Stub to use when constructing the output file. - * @param file Target file into which to write VCF records. - * @param indexOnTheFly true to index the file on the fly. NOTE: will be forced to false for compressed files. - * @param allowCompressed if false, we won't compress the output, even if the stub requests it. Critical - * for creating temp. output files that will be subsequently merged, as these do not - * support compressed output - * @return A VCF writer for use with this class - */ - private VariantContextWriter vcfWriterToFile(final VariantContextWriterStub stub, - final File file, - final boolean indexOnTheFly, - final boolean allowCompressed) { - try { - // we cannot merge compressed outputs, so don't compress if allowCompressed is false, - // which is the case when we have a temporary output file for later merging - if ( allowCompressed && stub.isCompressed() ) - stream = new BlockCompressedOutputStream(file); - else - stream = new PrintStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE)); - } - catch(IOException ex) { - throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex); - } - - EnumSet options = stub.getWriterOptions(indexOnTheFly); - VariantContextWriter writer = VariantContextWriterFactory.create(file, this.stream, stub.getMasterSequenceDictionary(), stub.getIndexCreator(), options); - - // if the stub says to test BCF, create a secondary writer to BCF and an 2 way out writer to send to both - // TODO -- remove me when argument generateShadowBCF is removed - if ( stub.alsoWriteBCFForTest() && ! VariantContextWriterFactory.isBCFOutput(file, options)) { - final File bcfFile = BCF2Utils.shadowBCF(file); - if ( bcfFile != null ) { - FileOutputStream bcfStream; - try { - bcfStream = new FileOutputStream(bcfFile); - } catch (FileNotFoundException e) { - throw new RuntimeException(bcfFile + ": Unable to create BCF writer", e); - } - - VariantContextWriter bcfWriter = VariantContextWriterFactory.create(bcfFile, bcfStream, stub.getMasterSequenceDictionary(), stub.getIndexCreator(), options); - writer = new TestWriter(writer, bcfWriter); - } - } - - return writer; - } - - private final static class TestWriter implements VariantContextWriter { - final List writers; - - private TestWriter(final VariantContextWriter ... writers) { - this.writers = Arrays.asList(writers); - } - - @Override - public void writeHeader(final VCFHeader header) { - for ( final VariantContextWriter writer : writers ) writer.writeHeader(header); - } - - @Override - public void close() { - for ( final VariantContextWriter writer : writers ) writer.close(); - } - - @Override - public void add(final VariantContext vc) { - for ( final VariantContextWriter writer : writers ) writer.add(vc); - } - } - - public void add(VariantContext vc) { - if ( closed ) throw new ReviewedGATKException("Attempting to write to a closed VariantContextWriterStorage " + vc.getStart() + " storage=" + this); - writer.add(vc); - } - - /** - * initialize this VCF header - * - * @param header the header - */ - public void writeHeader(VCFHeader header) { - writer.writeHeader(header); - } - - /** - * Close the VCF storage object. - */ - public void close() { - writer.close(); - closed = true; - } - - public void mergeInto(VariantContextWriterStorage target) { - try { - if ( ! closed ) - throw new ReviewedGATKException("Writer not closed, but we are merging into the file!"); - final String targetFilePath = target.file != null ? target.file.getAbsolutePath() : "/dev/stdin"; - logger.debug(String.format("Merging VariantContextWriterStorage from %s into %s", file.getAbsolutePath(), targetFilePath)); - - // use the feature manager to determine the right codec for the tmp file - // that way we don't assume it's a specific type - final FeatureManager.FeatureDescriptor fd = new FeatureManager().getByFiletype(file); - if ( fd == null ) - throw new UserException.LocalParallelizationProblem(file); - - final FeatureCodec codec = fd.getCodec(); - final AbstractFeatureReader source = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), codec, false); - - for ( final Feature vc : source.iterator() ) { - target.writer.add((VariantContext) vc); - } - - source.close(); - file.delete(); // this should be last to aid in debugging when the process fails - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(file, "Error reading file in VCFWriterStorage: ", e); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileReaderArgumentTypeDescriptor.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileReaderArgumentTypeDescriptor.java deleted file mode 100644 index 42397cb9a..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileReaderArgumentTypeDescriptor.java +++ /dev/null @@ -1,77 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.io.stubs; - -import htsjdk.samtools.SAMFileReader; -import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.sam.SAMFileReaderBuilder; - -import java.lang.reflect.Type; - -/** - * Describe how to parse SAMFileReaders. - */ -public class SAMFileReaderArgumentTypeDescriptor extends ArgumentTypeDescriptor { - /** - * The engine into which output stubs should be fed. - */ - private GenomeAnalysisEngine engine; - - /** - * Create a new SAMFileReader argument, notifying the given engine when that argument has been created. - * @param engine engine - */ - public SAMFileReaderArgumentTypeDescriptor( GenomeAnalysisEngine engine ) { - this.engine = engine; - } - - @Override - public boolean supports( Class type ) { - return SAMFileReader.class.isAssignableFrom(type); - } - - @Override - public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ) { - SAMFileReaderBuilder builder = new SAMFileReaderBuilder(); - - ArgumentMatchValue readerFileName = getArgumentValue( createDefaultArgumentDefinition(source), matches ); - - if( readerFileName == null ) - throw new UserException.CommandLineException("SAM file compression was supplied, but no associated writer was supplied with it."); - - builder.setSAMFile(readerFileName.asFile()); - - // WARNING: Skipping required side-effect because stub is impossible to generate. - engine.addInput(source, builder); - - // MASSIVE KLUDGE! SAMFileReader is tricky to implement and we don't yet have a stub. Return null, then - // let the output tracker load it in. - // TODO: Add a stub for SAMFileReader. - return null; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterArgumentTypeDescriptor.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterArgumentTypeDescriptor.java deleted file mode 100644 index c45432471..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterArgumentTypeDescriptor.java +++ /dev/null @@ -1,106 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.io.stubs; - -import htsjdk.samtools.SAMFileWriter; -import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.io.OutputStream; -import java.lang.reflect.Type; - -/** - * Insert a SAMFileWriterStub instead of a full-fledged concrete OutputStream implementations. - */ -public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { - - /** - * The engine into which output stubs should be fed. - */ - private final GenomeAnalysisEngine engine; - - /** - * The default location to which data should be written if the user specifies no such location. - */ - private final OutputStream defaultOutputStream; - - /** - * Create a new SAMFileWriter argument, notifying the given engine when that argument has been created. - * @param engine Engine to add SAMFileWriter output to. - * @param defaultOutputStream the target for the data - */ - public SAMFileWriterArgumentTypeDescriptor( GenomeAnalysisEngine engine, OutputStream defaultOutputStream ) { - this.engine = engine; - this.defaultOutputStream = defaultOutputStream; - } - - @Override - public boolean supports( Class type ) { - return SAMFileWriter.class.equals(type) || GATKSAMFileWriter.class.equals(type); - } - - @Override - public boolean createsTypeDefault(ArgumentSource source) { - return !source.isRequired() && source.defaultsToStdout(); - } - - @Override - public String typeDefaultDocString(ArgumentSource source) { - return "stdout"; - } - - @Override - public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { - if(source.isRequired() || !source.defaultsToStdout()) - throw new ReviewedGATKException("BUG: tried to create type default for argument type descriptor that can't support a type default."); - SAMFileWriterStub stub = new SAMFileWriterStub(engine,defaultOutputStream); - engine.addOutput(stub); - return stub; - } - - @Override - public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ) { - // Extract all possible parameters that could be passed to a BAM file writer? - ArgumentDefinition bamArgumentDefinition = createDefaultArgumentDefinition(source); - ArgumentMatchValue writerFileName = getArgumentValue( bamArgumentDefinition, matches ); - - // Create the stub - SAMFileWriterStub stub = null; // stub = new SAMFileWriterStub(engine, defaultOutputStream); - - if (writerFileName != null && writerFileName.asFile() != null ) { - stub = new SAMFileWriterStub(engine, writerFileName.asFile()); - - // WARNING: Side effects required by engine! - parsingEngine.addTags(stub,getArgumentTags(matches)); - engine.addOutput(stub); - } - - return stub; - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterStub.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterStub.java deleted file mode 100644 index cc814e9e6..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterStub.java +++ /dev/null @@ -1,336 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.io.stubs; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMFileWriter; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.util.ProgressLoggerInterface; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; -import org.broadinstitute.gatk.engine.io.OutputTracker; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.utils.baq.BAQ; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.io.File; -import java.io.OutputStream; -import java.util.ArrayList; -import java.util.List; - -/** - * A stub for routing and management of SAM file reading and writing. - * - * @author mhanna - * @version 0.1 - */ -public class SAMFileWriterStub implements Stub, GATKSAMFileWriter { - /** - * Engine to use for collecting attributes for the output SAM file. - */ - private final GenomeAnalysisEngine engine; - - /** - * A header supplied by the user that overrides the merged header from the input BAM. - */ - private SAMFileHeader headerOverride = null; - - /** - * The sam file that this stub should write to. Should be passed along to - * whatever happens to create the StreamConnector. - */ - private final File samFile; - - /** - * The target output stream, to be used in place of the SAM file. - */ - private final OutputStream samOutputStream; - - /** - * The validation stringency to apply when reading this file. - */ - private Integer compressionLevel = null; - - /** - * Should the GATK index the output BAM on-the-fly? - */ - private boolean indexOnTheFly = false; - - /** - * Should the GATK generate an md5 for the output BAM? - */ - private boolean generateMD5 = false; - - /** - * Should this BAM be presorted? - */ - private boolean presorted = true; - - /** - * How many records should the BAM writer store in RAM while - * sorting the BAM on-the-fly? - */ - private Integer maxRecordsInRam = null; - - /** - * Connects this stub with an external stream capable of serving the - * requests of the consumer of this stub. - */ - private OutputTracker outputTracker = null; - - /** - * Has the write started? If so, throw an exception if someone tries to - * change write parameters to the file (compression level, presorted flag, - * header, etc). - */ - private boolean writeStarted = false; - - - /** - * HMM for BAQ, if needed - */ - BAQ baqHMM = new BAQ(); - - /** - * Should we simplify the BAM file while writing it out? - */ - private boolean simplifyBAM = false; - - private List onOutputReadTransformers = null; - - /** - * Create a new stub given the requested SAM file and compression level. - * @param engine source of header data, maybe other data about input files. - * @param samFile SAM file to (ultimately) create. - */ - public SAMFileWriterStub( GenomeAnalysisEngine engine, File samFile ) { - this(engine, samFile, null); - } - - /** - * Create a new stub given the requested SAM file and compression level. - * @param engine source of header data, maybe other data about input files. - * @param stream Output stream to which data should be written. - */ - public SAMFileWriterStub( GenomeAnalysisEngine engine, OutputStream stream ) { - this(engine, null, stream); - } - - private SAMFileWriterStub(final GenomeAnalysisEngine engine, final File samFile, final OutputStream stream) { - this.engine = engine; - this.samFile = samFile; - this.samOutputStream = stream; - } - - /** - * Retrieves the SAM file to (ultimately) be created. - * @return The SAM file. Must not be null. - */ - public File getOutputFile() { - return samFile; - } - - public boolean simplifyBAM() { - return simplifyBAM; - } - - public void setSimplifyBAM(boolean v) { - simplifyBAM = v; - } - - public OutputStream getOutputStream() { - return samOutputStream; - } - - /** - * Retrieves the header to use when creating the new SAM file. - * @return header to use when creating the new SAM file. - */ - public SAMFileHeader getFileHeader() { - return headerOverride != null ? headerOverride : engine.getSAMFileHeader(); - } - - /** - * Retrieves the desired compression level for - * @return The current compression level. Could be null if the user doesn't care. - */ - public Integer getCompressionLevel() { - return compressionLevel; - } - - /** - * Sets the desired compression level. - * @param compressionLevel The suggested compression level. - */ - public void setCompressionLevel( Integer compressionLevel ) { - if(writeStarted) - throw new ReviewedGATKException("Attempted to change the compression level of a file with alignments already in it."); - this.compressionLevel = compressionLevel; - } - - /** - * Gets whether to index this output stream on-the-fly. - * @return True means create an index. False means skip index creation. - */ - public Boolean getIndexOnTheFly() { - return indexOnTheFly; - } - - /** - * Controls whether to index this output stream on-the-fly. - * @param indexOnTheFly True means create an index. False means skip index creation. - */ - public void setIndexOnTheFly( boolean indexOnTheFly ) { - if(writeStarted) - throw new UserException("Attempted to index a BAM on the fly of a file with alignments already in it."); - this.indexOnTheFly = indexOnTheFly; - } - - /** - * Gets whether to generate an md5 on-the-fly for this BAM. - * @return True generates the md5. False means skip writing the file. - */ - public Boolean getGenerateMD5() { - return generateMD5; - } - - /** - * Gets whether to generate an md5 on-the-fly for this BAM. - * @param generateMD5 True generates the md5. False means skip writing the file. - */ - public void setGenerateMD5(boolean generateMD5) { - if(writeStarted) - throw new UserException("Attempted to turn on md5 generation for BAM file with alignments already in it."); - this.generateMD5 = generateMD5; - } - - /** - * Whether the BAM file to create is actually presorted. - * @return True if the BAM file is presorted. False otherwise. - */ - public boolean isPresorted() { - return this.presorted; - } - - /** - * Set Whether the BAM file to create is actually presorted. - * @param presorted True if the BAM file is presorted. False otherwise. - */ - public void setPresorted(boolean presorted) { - if(writeStarted) - throw new ReviewedGATKException("Attempted to change the presorted state of a file with alignments already in it."); - this.presorted = presorted; - } - - /** - * Get the maximum number of reads to hold in RAM when sorting a BAM on-the-fly. - * @return Max records in RAM, or null if unset. - */ - public Integer getMaxRecordsInRam() { - return this.maxRecordsInRam; - } - - /** - * Sets the maximum number of reads to hold in RAM when sorting a BAM on-the-fly. - * @param maxRecordsInRam Max number of records in RAM. - */ - public void setMaxRecordsInRam(int maxRecordsInRam) { - if(writeStarted) - throw new ReviewedGATKException("Attempted to change the max records in RAM of a file with alignments already in it."); - this.maxRecordsInRam = maxRecordsInRam; - } - - /** - * Registers the given streamConnector with this stub. - * @param outputTracker The connector used to provide an appropriate stream. - */ - public void register( OutputTracker outputTracker ) { - this.outputTracker = outputTracker; - } - - @Override - public void processArguments( final GATKArgumentCollection argumentCollection ) { - if (argumentCollection.bamCompression != null) - setCompressionLevel(argumentCollection.bamCompression); - setGenerateMD5(argumentCollection.enableBAMmd5); - setIndexOnTheFly(!argumentCollection.disableBAMIndexing); - setSimplifyBAM(argumentCollection.simplifyBAM); - - } - - /** - * Use the given header as the target for this writer. - * @param header The header to write. - */ - public void writeHeader(SAMFileHeader header) { - if(writeStarted) - throw new ReviewedGATKException("Attempted to change the header of a file with alignments already in it."); - this.headerOverride = header; - } - - private void initializeReadTransformers() { - this.onOutputReadTransformers = new ArrayList<>(engine.getReadTransformers().size()); - for ( final ReadTransformer transformer : engine.getReadTransformers() ) { - if ( transformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_OUTPUT ) - onOutputReadTransformers.add(transformer); - } - } - - /** - * @{inheritDoc} - */ - public void addAlignment( final SAMRecord readIn ) { - if ( onOutputReadTransformers == null ) - initializeReadTransformers(); - - GATKSAMRecord workingRead = (GATKSAMRecord)readIn; - - // run on output read transformers - for ( final ReadTransformer transform : onOutputReadTransformers ) - workingRead = transform.apply(workingRead); - - writeStarted = true; - outputTracker.getStorage(this).addAlignment(workingRead); - } - - /** - * @{inheritDoc} - */ - public void close() { - outputTracker.getStorage(this).close(); - } - - /** - * @throws java.lang.UnsupportedOperationException No progress logging in this implementation. - */ - @Override - public void setProgressLogger(final ProgressLoggerInterface logger) { - throw new UnsupportedOperationException("Progress logging not supported"); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VCFWriterArgumentTypeDescriptor.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VCFWriterArgumentTypeDescriptor.java deleted file mode 100644 index 686133922..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VCFWriterArgumentTypeDescriptor.java +++ /dev/null @@ -1,148 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.io.stubs; - -import htsjdk.tribble.AbstractFeatureReader; -import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.io.File; -import java.io.OutputStream; -import java.lang.reflect.Type; -import java.util.Collection; - -/** - * Injects new command-line arguments into the system providing support for the genotype writer. - * - * @author mhanna - * @version 0.1 - */ -public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { - - /** - * The engine into which output stubs should be fed. - */ - private final GenomeAnalysisEngine engine; - - /** - * The default location to which data should be written if the user specifies no such location. - */ - private final OutputStream defaultOutputStream; - - /** - * The sources into which arguments were injected. - */ - private final Collection argumentSources; - - /** - * Create a new GenotypeWriter argument, notifying the given engine when that argument has been created. - * @param engine the engine to be notified. - * @param defaultOutputStream the default output stream to be written to if nothing else is specified. - * @param argumentSources sources from which command-line arguments should be derived. - */ - public VCFWriterArgumentTypeDescriptor(GenomeAnalysisEngine engine, OutputStream defaultOutputStream, Collection argumentSources) { - this.engine = engine; - this.defaultOutputStream = defaultOutputStream; - this.argumentSources = argumentSources; - } - - /** - * Reports whether this ArgumentTypeDescriptor supports the given type. - * @param type The type to check. - * @return True if the argument is a GenotypeWriter. - */ - @Override - public boolean supports( Class type ) { - return VariantContextWriter.class.equals(type); - } - - /** - * This command-line argument descriptor does want to override the provided default value. - * @return true always. - */ - @Override - public boolean createsTypeDefault(ArgumentSource source) { - return !source.isRequired() && source.defaultsToStdout(); - } - - @Override - public String typeDefaultDocString(ArgumentSource source) { - return "stdout"; - } - - @Override - public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) { - if(source.isRequired() || !source.defaultsToStdout()) - throw new ReviewedGATKException("BUG: tried to create type default for argument type descriptor that can't support a type default."); - VariantContextWriterStub stub = new VariantContextWriterStub(engine, defaultOutputStream, argumentSources); - engine.addOutput(stub); - return stub; - } - - /** - * Convert the given argument matches into a single object suitable for feeding into the ArgumentSource. - * @param source Source for this argument. - * @param type not used - * @param matches Matches that match with this argument. - * @return Transform from the matches into the associated argument. - */ - @Override - public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ) { - ArgumentDefinition defaultArgumentDefinition = createDefaultArgumentDefinition(source); - // Get the filename for the genotype file, if it exists. If not, we'll need to send output to out. - ArgumentMatchValue writerFileName = getArgumentValue(defaultArgumentDefinition,matches); - File writerFile = writerFileName != null ? writerFileName.asFile() : null; - - // This parser has been passed a null filename and the GATK is not responsible for creating a type default for the object; - // therefore, the user must have failed to specify a type default - if(writerFile == null && source.isRequired()) - throw new MissingArgumentValueException(defaultArgumentDefinition); - - // Create a stub for the given object. - final VariantContextWriterStub stub = (writerFile != null) - ? new VariantContextWriterStub(engine, writerFile, argumentSources) - : new VariantContextWriterStub(engine, defaultOutputStream, argumentSources); - - stub.setCompressed(isCompressed(writerFileName == null ? null: writerFileName.asString())); - - // WARNING: Side effects required by engine! - parsingEngine.addTags(stub,getArgumentTags(matches)); - engine.addOutput(stub); - - return stub; - } - - /** - * Returns true if the file will be compressed. - * @param writerFileName Name of the file - * @return true if the file will be compressed. - */ - public static boolean isCompressed(String writerFileName) { - return writerFileName != null && AbstractFeatureReader.hasBlockCompressedExtension(writerFileName); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VariantContextWriterStub.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VariantContextWriterStub.java deleted file mode 100644 index f40ede581..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VariantContextWriterStub.java +++ /dev/null @@ -1,301 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.io.stubs; - -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.tribble.index.IndexCreator; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; -import org.broadinstitute.gatk.engine.io.OutputTracker; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.writer.Options; -import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLine; - -import java.io.File; -import java.io.OutputStream; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Collection; -import java.util.EnumSet; -import java.util.List; - -/** - * A stub for routing and management of genotype reading and writing. - * - * @author ebanks - * @version 0.1 - */ -public class VariantContextWriterStub implements Stub, VariantContextWriter { - public final static boolean UPDATE_CONTIG_HEADERS = true; - - /** - * The engine, central to the GATK's processing. - */ - private final GenomeAnalysisEngine engine; - - /** - * The file that this stub should write to. Should be mutually - * exclusive with genotypeStream. - */ - private final File genotypeFile; - - /** - * The output stream to which stub data should be written. Will be - * mutually exclusive with genotypeFile. - */ - private final PrintStream genotypeStream; - - /** - * A hack: push the argument sources into the VCF header so that the VCF header - * can rebuild the command-line arguments. - */ - private final Collection argumentSources; - - /** - * Which IndexCreator to use - */ - private final IndexCreator indexCreator; - - /** - * The cached VCF header (initialized to null) - */ - private VCFHeader vcfHeader = null; - - /** - * Should we emit a compressed output stream? - */ - private boolean isCompressed = false; - - /** - * Should the header be written out? A hidden argument. - */ - private boolean skipWritingCommandLineHeader = false; - - /** - * Should we not write genotypes even when provided? - */ - private boolean doNotWriteGenotypes = false; - - /** - * Should we force BCF writing regardless of the file extension? - */ - private boolean forceBCF = false; - - /** - * Should we write all of the fields in the FORMAT field, even if missing fields could be trimmed? - */ - private boolean writeFullFormatField = false; - - /** - * Connects this stub with an external stream capable of serving the - * requests of the consumer of this stub. - */ - protected OutputTracker outputTracker = null; - - /** - * Create a new stub given the requested file. - * - * @param engine engine. - * @param genotypeFile file to (ultimately) create. - * @param argumentSources sources. - */ - public VariantContextWriterStub(GenomeAnalysisEngine engine, File genotypeFile, Collection argumentSources) { - this.engine = engine; - this.genotypeFile = genotypeFile; - this.genotypeStream = null; - this.indexCreator = GATKVCFUtils.getIndexCreator(engine.getArguments().variant_index_type, engine.getArguments().variant_index_parameter, genotypeFile); - this.argumentSources = argumentSources; - } - - /** - * Create a new stub given the requested file. - * - * @param engine engine. - * @param genotypeStream stream to (ultimately) write. - * @param argumentSources sources. - */ - public VariantContextWriterStub(GenomeAnalysisEngine engine, OutputStream genotypeStream, Collection argumentSources) { - this.engine = engine; - this.genotypeFile = null; - this.genotypeStream = new PrintStream(genotypeStream); - this.indexCreator = null; - this.argumentSources = argumentSources; - } - - /** - * Retrieves the file to (ultimately) be created. - * @return The file. Can be null if genotypeStream is not. - */ - public File getOutputFile() { - return genotypeFile; - } - - /** - * Retrieves the output stream to which to (ultimately) write. - * @return The file. Can be null if genotypeFile is not. - */ - public OutputStream getOutputStream() { - return genotypeStream; - } - - public boolean isCompressed() { - return isCompressed; - } - - public void setCompressed(final boolean compressed) { - isCompressed = compressed; - } - - public void setSkipWritingCommandLineHeader(final boolean skipWritingCommandLineHeader) { - this.skipWritingCommandLineHeader = skipWritingCommandLineHeader; - } - - public void setDoNotWriteGenotypes(final boolean doNotWriteGenotypes) { - this.doNotWriteGenotypes = doNotWriteGenotypes; - } - - public void setForceBCF(final boolean forceBCF) { - this.forceBCF = forceBCF; - } - - public void setWriteFullFormatField(final boolean writeFullFormatField) { - this.writeFullFormatField = writeFullFormatField; - } - - public IndexCreator getIndexCreator() { - return indexCreator; - } - - /** - * Gets the master sequence dictionary from the engine associated with this stub - * @link GenomeAnalysisEngine.getMasterSequenceDictionary - * @return the master sequence dictionary from the engine associated with this stub - */ - public SAMSequenceDictionary getMasterSequenceDictionary() { - return engine.getMasterSequenceDictionary(); - } - - public EnumSet getWriterOptions() { - return getWriterOptions(false); - } - - public EnumSet getWriterOptions(boolean indexOnTheFly) { - final List options = new ArrayList<>(); - - if ( doNotWriteGenotypes ) options.add(Options.DO_NOT_WRITE_GENOTYPES); - if ( engine.lenientVCFProcessing() ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER); - if ( indexOnTheFly) options.add(Options.INDEX_ON_THE_FLY); - if ( writeFullFormatField ) options.add(Options.WRITE_FULL_FORMAT_FIELD); - - if ( forceBCF || (getOutputFile() != null && VariantContextWriterFactory.isBCFOutput(getOutputFile())) ) - options.add(Options.FORCE_BCF); - - return options.isEmpty() ? EnumSet.noneOf(Options.class) : EnumSet.copyOf(options); - } - - /** - * Retrieves the header to use when creating the new file. - * @return header to use when creating the new file. - */ - public VCFHeader getVCFHeader() { - return vcfHeader; - } - - /** - * Registers the given streamConnector with this stub. - * @param outputTracker The connector used to provide an appropriate stream. - */ - public void register( OutputTracker outputTracker ) { - this.outputTracker = outputTracker; - } - - @Override - public void processArguments( final GATKArgumentCollection argumentCollection ) { - setDoNotWriteGenotypes(argumentCollection.sitesOnlyVCF); - setSkipWritingCommandLineHeader(argumentCollection.disableCommandLineInVCF); - setForceBCF(argumentCollection.forceBCFOutput); - setWriteFullFormatField(argumentCollection.neverTrimVCFFormatField); - } - - public void writeHeader(VCFHeader header) { - vcfHeader = header; - - if ( header.isWriteEngineHeaders() ) { - // skip writing the command line header if requested - if ( ! skipWritingCommandLineHeader && header.isWriteCommandLine() ) { - // Always add the header line, as the current format allows multiple entries - final VCFHeaderLine commandLineArgHeaderLine = GATKVCFUtils.getCommandLineArgumentHeaderLine(engine, argumentSources); - vcfHeader.addMetaDataLine(commandLineArgHeaderLine); - } - - if ( UPDATE_CONTIG_HEADERS ) - vcfHeader = GATKVCFUtils.withUpdatedContigs(vcfHeader, engine); - } - - outputTracker.getStorage(this).writeHeader(vcfHeader); - } - - /** - * @{inheritDoc} - */ - public void add(VariantContext vc) { - outputTracker.getStorage(this).add(vc); - } - - /** - * @{inheritDoc} - */ - public void close() { - outputTracker.getStorage(this).close(); - } - - /** - * Gets a string representation of this object. - * @return a string representation of this object. - */ - @Override - public String toString() { - return getClass().getName(); - } - - /** - * Should we also write a BCF file alongside our VCF file for testing - * - * TODO -- remove me when argument generateShadowBCF is removed - * - * @return - */ - public boolean alsoWriteBCFForTest() { - return engine.getArguments().numberOfDataThreads == 1 && // only works single threaded - ! isCompressed() && // for non-compressed outputs - getOutputFile() != null && // that are going to disk - engine.getArguments().generateShadowBCF; // and we actually want to do it - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIterator.java deleted file mode 100644 index cb696e58e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIterator.java +++ /dev/null @@ -1,159 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.iterators; - -import htsjdk.samtools.MergingSamRecordIterator; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMRecord; - -import java.util.Iterator; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - - -/** - * @author aaron - * @version 1.0 - * @date Apr 14, 2009 - *

    - * Class BoundedReadIterator - *

    - * This class implements a read iterator that is bounded by the number of reads - * it will produce over the iteration. - */ -public class BoundedReadIterator implements GATKSAMIterator { - - // the genome loc we're bounding - final private long readCount; - private long currentCount = 0; - - // the iterator we want to decorate - private final GATKSAMIterator iterator; - - // our unmapped read flag - private boolean doNotUseThatUnmappedReadPile = false; - - /** - * The next read that we've buffered. Null indicates that there's - * nothing in the buffer (not that there isn't a next read). - */ - private SAMRecord record = null; - - /** - * constructor - * @param iter - * @param readCount - */ - public BoundedReadIterator(GATKSAMIterator iter, long readCount) { - this.iterator = iter; - this.readCount = readCount; - } - - public void useUnmappedReads(boolean useThem) { - this.doNotUseThatUnmappedReadPile = useThem; - } - - public SAMFileHeader getHeader() { - // todo: this is bad, we need an iterface out there for samrecords that supports getting the header, - // regardless of the merging - if (iterator instanceof MergingSamRecordIterator) - return ((MergingSamRecordIterator)iterator).getMergedHeader(); - else - return null; - } - - /** - * Do we have a next? If the iterator has a read and we're not over the read - * count, then yes - * @return - */ - public boolean hasNext() { - if( record != null ) - return true; - - if (iterator.hasNext() && currentCount < readCount) { - record = iterator.next(); - ++currentCount; - if (record.getAlignmentStart() == 0 && doNotUseThatUnmappedReadPile) { - return false; - } - return true; - } else { - return false; - } - } - - /** - * get the next SAMRecord - * @return SAMRecord representing the next read - */ - public SAMRecord next() { - SAMRecord cached = record; - record = null; - return cached; - } - - /** - * this is unsupported on SAMRecord iterators - */ - public void remove() { - throw new UnsupportedOperationException("You cannot use an iterator to remove a SAMRecord"); - } - - /** - * close the iterator - */ - public void close() { - iterator.close(); - } - - public Iterator iterator() { - return this; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIterator.java deleted file mode 100644 index 8ca5cfdbe..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIterator.java +++ /dev/null @@ -1,56 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.iterators; - -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.util.CloseableIterator; -/** - * - * User: aaron - * Date: May 6, 2009 - * Time: 5:30:41 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - -/** - * @author aaron - * @version 1.0 - * @date May 6, 2009 - *

    - * Interface GATKSAMIterator - *

    - * This is the standard interface for all iterators in the GATK package that iterate over SAMRecords - */ -public interface GATKSAMIterator extends CloseableIterator, Iterable { -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIteratorAdapter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIteratorAdapter.java deleted file mode 100644 index 0dc3e62a7..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIteratorAdapter.java +++ /dev/null @@ -1,136 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.iterators; - -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.util.CloseableIterator; - -import java.util.Iterator; - -/** - * - * User: aaron - * Date: May 13, 2009 - * Time: 6:33:15 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date May 13, 2009 - *

    - * Class GATKSAMIteratorAdapter - *

    - * This class adapts other SAMRecord iterators to the GATKSAMIterator - */ -public class GATKSAMIteratorAdapter { - - public static GATKSAMIterator adapt(Iterator iter) { - return new PrivateStringSAMIterator(iter); - } - - public static GATKSAMIterator adapt(CloseableIterator iter) { - return new PrivateStringSAMCloseableIterator(iter); - } - -} - - -/** - * this class wraps iterators in a GATKSAMIterator, which means just adding the - * methods that implement the iterable<> interface and the close() method from CloseableIterator - */ -class PrivateStringSAMIterator implements GATKSAMIterator { - private Iterator iter = null; - - PrivateStringSAMIterator(Iterator iter) { - this.iter = iter; - } - - public void close() { - // do nothing, we can't close the iterator anyway. - } - - public boolean hasNext() { - return iter.hasNext(); - } - - public SAMRecord next() { - return iter.next(); - } - - public void remove() { - throw new UnsupportedOperationException("GATKSAMIterator's don't allow remove()ing"); - } - - public Iterator iterator() { - return iter; - } -} - - -/** - * this class wraps closeable iterators in a GATKSAMIterator, which means adding the - * methods that implement the iterable<> interface. - */ -class PrivateStringSAMCloseableIterator implements GATKSAMIterator { - private CloseableIterator iter = null; - - PrivateStringSAMCloseableIterator(CloseableIterator iter) { - this.iter = iter; - } - - public void close() { - iter.close(); - } - - public boolean hasNext() { - return iter.hasNext(); - } - - public SAMRecord next() { - return iter.next(); - } - - public void remove() { - throw new UnsupportedOperationException("GATKSAMIterator's don't allow remove()ing"); - } - - public Iterator iterator() { - return iter; - } -} - diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMRecordIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMRecordIterator.java deleted file mode 100644 index 6d02acd4a..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMRecordIterator.java +++ /dev/null @@ -1,57 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.iterators; - -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.util.CloseableIterator; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.Iterator; - -/** - * Temporarily hack to convert SAMRecords to GATKSAMRecords - * - * User: depristo - * Date: 1/11/13 - * Time: 1:19 PM - */ -public class GATKSAMRecordIterator implements CloseableIterator, Iterable { - final CloseableIterator it; - - public GATKSAMRecordIterator(final CloseableIterator it) { - this.it = it; - } - - public GATKSAMRecordIterator(final GATKSAMIterator it) { - this.it = it; - } - - @Override public boolean hasNext() { return it.hasNext(); } - @Override public GATKSAMRecord next() { return (GATKSAMRecord)it.next(); } - @Override public void remove() { it.remove(); } - @Override public void close() { it.close(); } - @Override public Iterator iterator() { return this; } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/NullSAMIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/NullSAMIterator.java deleted file mode 100644 index fa130f930..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/NullSAMIterator.java +++ /dev/null @@ -1,57 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.iterators; - -import htsjdk.samtools.SAMRecord; - -import java.util.Iterator; -import java.util.NoSuchElementException; -/** - * User: hanna - * Date: May 19, 2009 - * Time: 6:47:16 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * A placeholder for an iterator with no data. - */ -public class NullSAMIterator implements GATKSAMIterator { - public NullSAMIterator() {} - - public Iterator iterator() { return this; } - public void close() { /* NO-OP */ } - - public boolean hasNext() { return false; } - public SAMRecord next() { throw new NoSuchElementException("No next element is available."); } - public void remove() { throw new UnsupportedOperationException("Cannot remove from a GATKSAMIterator"); } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/PositionTrackingIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/PositionTrackingIterator.java deleted file mode 100644 index 2eba344bb..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/PositionTrackingIterator.java +++ /dev/null @@ -1,105 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.iterators; - -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.util.CloseableIterator; - -/** - * Iterates through a list of elements, tracking the number of elements it has seen. - * @author hanna - * @version 0.1 - */ -public class PositionTrackingIterator implements GATKSAMIterator { - /** - * The iterator being tracked. - */ - private CloseableIterator iterator; - - /** - * Current position within the tracked iterator. - */ - private long position; - - /** - * Retrieves the current position of the iterator. The 'current position' of the iterator is defined as - * the coordinate of the read that will be returned if next() is called. - * @return The current position of the iterator. - */ - public long getPosition() { - return position; - } - - /** - * Create a new iterator wrapping the given position, assuming that the reader is position reads - * into the sequence. - * @param iterator Iterator to wraps. - * @param position Non-negative position where the iterator currently sits. - */ - public PositionTrackingIterator(CloseableIterator iterator, long position ) { - this.iterator = iterator; - this.position = position; - } - - /** - * {@inheritDoc} - */ - public boolean hasNext() { - return iterator.hasNext(); - } - - /** - * Try to get the next read in the list. If a next read is available, increment the position. - * @return next read in the list, if available. - */ - public SAMRecord next() { - try { - return iterator.next(); - } - finally { - position++; - } - } - - /** - * {@inheritDoc} - */ - public GATKSAMIterator iterator() { - return this; - } - - /** - * {@inheritDoc} - */ - public void close() { - iterator.close(); - } - - /** - * {@inheritDoc} - */ - public void remove() { throw new UnsupportedOperationException("Cannot remove from a GATKSAMIterator"); } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/PushbackIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/PushbackIterator.java deleted file mode 100644 index 0bb545b6e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/PushbackIterator.java +++ /dev/null @@ -1,82 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.iterators; - -import java.util.Iterator; - -public class PushbackIterator implements Iterator, Iterable { - Iterator underlyingIterator; - T pushedElement = null; - - public PushbackIterator(final Iterator underlyingIterator) { - this.underlyingIterator = underlyingIterator; - } - - public boolean hasNext() { - return pushedElement != null || underlyingIterator.hasNext(); - } - - public Iterator iterator() { - return this; - } - - /** - * Retrieves, but does not remove, the head of this iterator. - * @return T the next element in the iterator - */ - public T element() { - T x = next(); - pushback(x); - return x; - } - - /** - * @return the next element in the iteration. - */ - public T next() { - if (pushedElement != null) { - final T ret = pushedElement; - pushedElement = null; - return ret; - } else { - return underlyingIterator.next(); - } - } - - public void pushback(T elt) { - assert(pushedElement == null); - - pushedElement = elt; - } - - public void remove() { - throw new UnsupportedOperationException(); - } - - public Iterator getUnderlyingIterator() { - return underlyingIterator; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIterator.java deleted file mode 100644 index 492227932..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIterator.java +++ /dev/null @@ -1,140 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.iterators; - -import htsjdk.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.sam.AlignmentUtils; - -/** - * An iterator which does post-processing of a read, including potentially wrapping - * the read in something with a compatible interface or replacing the read entirely. - * - * @author mhanna - * @version 0.1 - */ -public class ReadFormattingIterator implements GATKSAMIterator { - /** - * Logger. - */ - final protected static Logger logger = Logger.getLogger(ReadFormattingIterator.class); - - /** - * Iterator to which to pass - */ - private GATKSAMIterator wrappedIterator; - - /** - * True if original base qualities should be used. - */ - private final boolean useOriginalBaseQualities; - - /** - * Positive if there is a default Base Quality value to fill in the reads with. - */ - private final byte defaultBaseQualities; - - - /** - * Decorate the given iterator inside a ReadWrappingIterator. - * @param wrappedIterator iterator - * @param useOriginalBaseQualities true if original base qualities should be used - * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. - */ - public ReadFormattingIterator(GATKSAMIterator wrappedIterator, boolean useOriginalBaseQualities, byte defaultBaseQualities) { - this.wrappedIterator = wrappedIterator; - this.useOriginalBaseQualities = useOriginalBaseQualities; - this.defaultBaseQualities = defaultBaseQualities; - - } - - /** - * Convenience function for use in foreach loops. Dangerous because it does not actually - * reset the iterator. - * @return An iterator through the current data stream. - */ - public GATKSAMIterator iterator() { - // NOTE: this iterator doesn't perform any kind of reset operation; it just returns itself. - // can we do something better? Do we really have to provide support for the Iterable interface? - return this; - } - - /** - * Close this iterator. - */ - public void close() { - wrappedIterator.close(); - } - - /** - * Does the iterator contain more values? - * @return True if there are more left to return, false otherwise. - */ - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - /** - * Get the next value in the sequence. - * @return Next value in the sequence. By convention, a NoSuchElementException should be thrown if - * no next exists. - */ - public SAMRecord next() { - SAMRecord rec = wrappedIterator.next(); - - // Always consolidate the cigar string into canonical form, collapsing zero-length / repeated cigar elements. - // Downstream code (like LocusIteratorByState) cannot necessarily handle non-consolidated cigar strings. - rec.setCigar(AlignmentUtils.consolidateCigar(rec.getCigar())); - - // if we are using default quals, check if we need them, and add if necessary. - // 1. we need if reads are lacking or have incomplete quality scores - // 2. we add if defaultBaseQualities has a positive value - if (defaultBaseQualities >= 0) { - byte reads [] = rec.getReadBases(); - byte quals [] = rec.getBaseQualities(); - if (quals == null || quals.length < reads.length) { - byte new_quals [] = new byte [reads.length]; - for (int i=0; i cur.getReferenceIndex()) || - (last.getReferenceIndex().equals(cur.getReferenceIndex()) && - last.getAlignmentStart() > cur.getAlignmentStart()); - } - } - - public void remove() { - throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); - } - - public void close() { - it.close(); - } - - public Iterator iterator() { - return this; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReport.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReport.java deleted file mode 100644 index f8126b9a9..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReport.java +++ /dev/null @@ -1,786 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.phonehome; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.crypt.CryptUtils; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.io.IOUtils; -import org.broadinstitute.gatk.utils.io.Resource; -import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor; -import org.jets3t.service.S3Service; -import org.jets3t.service.S3ServiceException; -import org.jets3t.service.impl.rest.httpclient.RestS3Service; -import org.jets3t.service.model.S3Object; -import org.jets3t.service.security.AWSCredentials; -import org.simpleframework.xml.Element; -import org.simpleframework.xml.Serializer; -import org.simpleframework.xml.core.Persister; - -import java.io.*; -import java.security.NoSuchAlgorithmException; -import java.security.PublicKey; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.Arrays; -import java.util.Date; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; - - -/** - * A detailed description of a GATK run, and error if applicable. Simply create a GATKRunReport - * with the constructor, providing the walker that was run and the fully instantiated GenomeAnalysisEngine - * after the run finishes and the GATKRunReport will collect all of the report information - * into this object. Call postReport to write out the report, as an XML document, to either STDOUT, - * a file (in which case the output is gzipped), or with no arguments the report will be posted to the - * GATK run report database. - * - * @author depristo - * @since 2010 - */ -public class GATKRunReport { - protected static final String REPORT_BUCKET_NAME = "broad.gsa.gatk.run.reports"; - protected static final String TEST_REPORT_BUCKET_NAME = "broad.gsa.gatk.run.reports.test"; - protected final static String AWS_ACCESS_KEY_MD5 = "34d4a26eb2062b3f06e833b28f9a38c6"; - protected final static String AWS_SECRET_KEY_MD5 = "83f2332eec99ef1d7425d5dc5d4b514a"; - - private static final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyy/MM/dd HH.mm.ss"); - - /** - * our log - */ - protected static final Logger logger = Logger.getLogger(GATKRunReport.class); - - /** - * Default value for the number of milliseconds before an S3 put operation is timed-out. - * Can be overridden via a constructor argument. - */ - private static final long S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS = 30 * 1000; - - /** - * Number of milliseconds before an S3 put operation is timed-out. - */ - private long s3PutTimeOutInMilliseconds = S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS; - - // ----------------------------------------------------------------- - // elements captured for the report - // ----------------------------------------------------------------- - - @Element(required = false, name = "id") - private String id; - - @Element(required = false, name = "exception") - private GATKRunReportException mException; - - @Element(required = true, name = "start-time") - private String startTime = "ND"; - - @Element(required = true, name = "end-time") - private String endTime; - - @Element(required = true, name = "run-time") - private long runTime = 0; - - @Element(required = true, name = "walker-name") - private String walkerName; - - @Element(required = true, name = "svn-version") - private String svnVersion; - - @Element(required = true, name = "total-memory") - private long totalMemory; - - @Element(required = true, name = "max-memory") - private long maxMemory; - - @Element(required = true, name = "user-name") - private String userName; - - @Element(required = true, name = "host-name") - private String hostName; - - @Element(required = true, name = "java") - private String javaVersion; - - @Element(required = true, name = "machine") - private String machine; - - @Element(required = true, name = "iterations") - private long nIterations; - - @Element(required = true, name = "tag") - private String tag; - - @Element(required = true, name = "num-threads") - private int numThreads; - @Element(required = true, name = "percent-time-running") - private String percentTimeRunning; - @Element(required = true, name = "percent-time-waiting") - private String percentTimeWaiting; - @Element(required = true, name = "percent-time-blocking") - private String percentTimeBlocking; - @Element(required = true, name = "percent-time-waiting-for-io") - private String percentTimeWaitingForIO; - - /** The error message, if one occurred, or null if none did */ - public String errorMessage = null; - /** The error that occurred, if one did, or null if none did */ - public Throwable errorThrown = null; - - /** - * How should the GATK report its usage? - */ - public enum PhoneHomeOption { - /** Disable phone home */ - NO_ET, - /** Forces the report to go to S3 */ - AWS, - /** Force output to STDOUT. For debugging only */ - STDOUT - } - - /** - * To allow us to deserial reports from XML - */ - private GATKRunReport() { } - - /** - * Read a GATKRunReport from the serialized XML representation in String reportAsXML - * @param stream an input stream containing a serialized XML report - * @return a reconstituted GATKRunReport from reportAsXML - * @throws Exception if parsing fails for any reason - */ - @Ensures("result != null") - protected static GATKRunReport deserializeReport(final InputStream stream) throws Exception { - final Serializer serializer = new Persister(); - return serializer.read(GATKRunReport.class, stream); - } - - /** - * Create a new GATKRunReport from a report on S3 - * - * Assumes that s3Object has already been written to S3, and this function merely - * fetches it from S3 and deserializes it. The access keys must have permission to - * GetObject from S3. - * - * @param downloaderAccessKey AWS access key with permission to GetObject from bucketName - * @param downloaderSecretKey AWS secret key with permission to GetObject from bucketName - * @param bucketName the name of the bucket holding the report - * @param s3Object the s3Object we wrote to S3 in bucketName that we want to get back and decode - * @return a deserialized report derived from s3://bucketName/s3Object.getName() - * @throws Exception - */ - @Ensures("result != null") - protected static GATKRunReport deserializeReport(final String downloaderAccessKey, - final String downloaderSecretKey, - final String bucketName, - final S3Object s3Object) throws Exception { - final S3Service s3Service = initializeAWSService(downloaderAccessKey, downloaderSecretKey); - - // Retrieve the whole data object we created previously - final S3Object objectComplete = s3Service.getObject(bucketName, s3Object.getName()); - - // Read the data from the object's DataInputStream using a loop, and print it out. - return deserializeReport(new GZIPInputStream(objectComplete.getDataInputStream())); - } - - /** - * Create a new RunReport and population all of the fields with values from the walker and engine. - * Allows the S3 put timeout to be explicitly set. - * - * @param walker the GATK walker that we ran - * @param e the exception caused by running this walker, or null if we completed successfully - * @param engine the GAE we used to run the walker, so we can fetch runtime, args, etc - * @param type the GATK phone home setting - * @param s3PutTimeOutInMilliseconds number of milliseconds to wait before timing out an S3 put operation - */ - public GATKRunReport(final Walker walker, final Exception e, final GenomeAnalysisEngine engine, final PhoneHomeOption type, - final long s3PutTimeOutInMilliseconds) { - this(walker, e, engine, type); - this.s3PutTimeOutInMilliseconds = s3PutTimeOutInMilliseconds; - } - - /** - * Create a new RunReport and population all of the fields with values from the walker and engine. - * Leaves the S3 put timeout set to the default value of S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS. - * - * @param walker the GATK walker that we ran - * @param e the exception caused by running this walker, or null if we completed successfully - * @param engine the GAE we used to run the walker, so we can fetch runtime, args, etc - * @param type the GATK phone home setting - */ - public GATKRunReport(final Walker walker, final Exception e, final GenomeAnalysisEngine engine, final PhoneHomeOption type) { - if ( type == PhoneHomeOption.NO_ET ) - throw new ReviewedGATKException("Trying to create a run report when type is NO_ET!"); - - logger.debug("Aggregating data for run report"); - - // what did we run? - id = org.apache.commons.lang.RandomStringUtils.randomAlphanumeric(32); - walkerName = engine.getWalkerName(walker.getClass()); - svnVersion = CommandLineGATK.getVersionNumber(); - - // runtime performance metrics - Date end = new java.util.Date(); - endTime = DATE_FORMAT.format(end); - if ( engine.getStartTime() != null ) { // made it this far during initialization - startTime = DATE_FORMAT.format(engine.getStartTime()); - runTime = (end.getTime() - engine.getStartTime().getTime()) / 1000L; // difference in seconds - } - - // deal with memory usage - Runtime.getRuntime().gc(); // call GC so totalMemory is ~ used memory - maxMemory = Runtime.getRuntime().maxMemory(); - totalMemory = Runtime.getRuntime().totalMemory(); - - // we can only do some operations if an error hasn't occurred - if ( engine.getCumulativeMetrics() != null ) { - // it's possible we aborted so early that these data structures arent initialized - nIterations = engine.getCumulativeMetrics().getNumIterations(); - } - - tag = engine.getArguments().tag; - - // user and hostname -- information about the runner of the GATK - userName = System.getProperty("user.name"); - hostName = Utils.resolveHostname(); - - // basic java information - javaVersion = Utils.join("-", Arrays.asList(System.getProperty("java.vendor"), System.getProperty("java.version"))); - machine = Utils.join("-", Arrays.asList(System.getProperty("os.name"), System.getProperty("os.arch"))); - - // if there was an exception, capture it - this.mException = e == null ? null : new GATKRunReportException(e); - - numThreads = engine.getTotalNumberOfThreads(); - percentTimeRunning = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.USER_CPU); - percentTimeBlocking = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.BLOCKING); - percentTimeWaiting = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING); - percentTimeWaitingForIO = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING_FOR_IO); - } - - /** - * Get the random alpha-numeric ID of this GATKRunReport - * @return a non-null string ID - */ - @Ensures("result != null") - public String getID() { - return id; - } - - /** - * Return a string representing the percent of time the GATK spent in state, if possible. Otherwise return NA - * - * @param engine the GATK engine whose threading efficiency info we will use - * @param state the state whose occupancy we wish to know - * @return a string representation of the percent occupancy of state, or NA is not possible - */ - @Requires({"engine != null", "state != null"}) - @Ensures("result != null") - private String getThreadEfficiencyPercent(final GenomeAnalysisEngine engine, final ThreadEfficiencyMonitor.State state) { - final ThreadEfficiencyMonitor tem = engine.getThreadEfficiencyMonitor(); - return tem == null ? "NA" : String.format("%.2f", tem.getStatePercent(state)); - } - - /** - * Get a filename (no path) appropriate for this report - * - * @return a non-null string filename - */ - @Ensures("result != null") - protected String getReportFileName() { - return getID() + ".report.xml.gz"; - } - - // --------------------------------------------------------------------------- - // - // Main public interface method for posting reports - // - // --------------------------------------------------------------------------- - - /** - * Post this GATK report to the destination implied by the PhoneHomeOption type - * - * Guaranteed to never throw an exception (exception noted below) and to return - * with a reasonable (~10 seconds) time regardless of successful writing of the report. - * - * @throws IllegalArgumentException if type == null - * @param type the type of phoning home we want to do - * @return true if a report was successfully written, false otherwise - */ - public boolean postReport(final PhoneHomeOption type) { - if ( type == null ) throw new IllegalArgumentException("type cannot be null"); - - logger.debug("Posting report of type " + type); - switch (type) { - case NO_ET: // don't do anything - return false; - case AWS: - wentToAWS = true; - return postReportToAWSS3() != null; - case STDOUT: - return postReportToStream(System.out); - default: - exceptDuringRunReport("BUG: unexpected PhoneHomeOption "); - return false; - } - } - - // --------------------------------------------------------------------------- - // - // Code for sending reports to local files - // - // --------------------------------------------------------------------------- - - /** - * Write an XML representation of this report to the stream, throwing a GATKException if the marshalling - * fails for any reason. - * - * @param stream an output stream to write the report to - */ - @Requires("stream != null") - protected boolean postReportToStream(final OutputStream stream) { - final Serializer serializer = new Persister(); - try { - serializer.write(this, stream); - return true; - } catch (Exception e) { - return false; - } - } - - // --------------------------------------------------------------------------- - // - // Code for sending reports to s3 - // - // --------------------------------------------------------------------------- - - /** - * Get the name of the S3 bucket where we should upload this report - * - * @return the string name of the s3 bucket - */ - @Ensures("result != null") - protected String getS3ReportBucket() { - return s3ReportBucket; - } - - /** - * Decrypts encrypted AWS key from encryptedKeySource - * @param encryptedKeySource a file containing an encrypted AWS key - * @return a decrypted AWS key as a String - */ - @Ensures("result != null") - public static String decryptAWSKey(final File encryptedKeySource) throws FileNotFoundException { - if ( encryptedKeySource == null ) throw new IllegalArgumentException("encryptedKeySource cannot be null"); - return decryptAWSKey(new FileInputStream(encryptedKeySource)); - } - - /** - * @see #decryptAWSKey(java.io.File) but with input from an inputstream - */ - @Requires("encryptedKeySource != null") - @Ensures("result != null") - private static String decryptAWSKey(final InputStream encryptedKeySource) { - final PublicKey key = CryptUtils.loadGATKDistributedPublicKey(); - final byte[] fromDisk = IOUtils.readStreamIntoByteArray(encryptedKeySource); - final byte[] decrypted = CryptUtils.decryptData(fromDisk, key); - return new String(decrypted); - } - - /** - * Get the decrypted AWS key sorted in the resource directories of name - * @param name the name of the file containing the needed AWS key - * @return a non-null GATK - */ - @Requires("name != null") - @Ensures("result != null") - private static String getAWSKey(final String name) { - final Resource resource = new Resource(name, GATKRunReport.class); - return decryptAWSKey(resource.getResourceContentsAsStream()); - } - - /** - * Get the AWS access key for the GATK user - * @return a non-null AWS access key for the GATK user - */ - @Ensures("result != null") - protected static String getAWSUploadAccessKey() { - return getAWSKey("resources/GATK_AWS_access.key"); - } - - /** - * Get the AWS secret key for the GATK user - * @return a non-null AWS secret key for the GATK user - */ - @Ensures("result != null") - protected static String getAWSUploadSecretKey() { - return getAWSKey("resources/GATK_AWS_secret.key"); - } - - /** - * Check that the AWS keys can be decrypted and are what we expect them to be - * - * @throws ReviewedGATKException if anything goes wrong - */ - public static void checkAWSAreValid() { - try { - final String accessKeyMD5 = Utils.calcMD5(getAWSUploadAccessKey()); - final String secretKeyMD5 = Utils.calcMD5(getAWSUploadSecretKey()); - - if ( ! AWS_ACCESS_KEY_MD5.equals(accessKeyMD5) ) { - throw new ReviewedGATKException("Invalid AWS access key found, expected MD5 " + AWS_ACCESS_KEY_MD5 + " but got " + accessKeyMD5); - } - if ( ! AWS_SECRET_KEY_MD5.equals(secretKeyMD5) ) { - throw new ReviewedGATKException("Invalid AWS secret key found, expected MD5 " + AWS_SECRET_KEY_MD5 + " but got " + secretKeyMD5); - } - - } catch ( Exception e ) { - throw new ReviewedGATKException("Couldn't decrypt AWS keys, something is wrong with the GATK distribution"); - } - } - - /** - * Get an initialized S3Service for use in communicating with AWS/s3 - * - * @param awsAccessKey our AWS access key to use - * @param awsSecretKey our AWS secret key to use - * @return an initialized S3Service object that can be immediately used to interact with S3 - * @throws S3ServiceException - */ - @Requires({"awsAccessKey != null", "awsSecretKey != null"}) - @Ensures("result != null") - protected static S3Service initializeAWSService(final String awsAccessKey, final String awsSecretKey) throws S3ServiceException { - // To communicate with S3, create a class that implements an S3Service. We will use the REST/HTTP - // implementation based on HttpClient, as this is the most robust implementation provided with JetS3t. - final AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey); - return new RestS3Service(awsCredentials); - } - - /** - * A runnable that pushes this GATKReport up to s3. - * - * Should be run in a separate thread so we can time it out if something is taking too long - */ - private class S3PutRunnable implements Runnable { - /** Was the upload operation successful? */ - public final AtomicBoolean isSuccess; - /** The name of this report */ - private final String filename; - /** The contents of this report */ - private final byte[] contents; - - /** The s3Object that we created to upload, or null if it failed */ - public S3Object s3Object = null; - - @Requires({"filename != null", "contents != null"}) - public S3PutRunnable(final String filename, final byte[] contents){ - this.isSuccess = new AtomicBoolean(); - this.filename = filename; - this.contents = contents; - } - - public void run() { - try { - switch ( awsMode ) { - case FAIL_WITH_EXCEPTION: - throw new IllegalStateException("We are throwing an exception for testing purposes"); - case TIMEOUT: - try { - Thread.sleep(s3PutTimeOutInMilliseconds * 100); - } catch ( InterruptedException e ) { - // supposed to be empty - } - break; - case NORMAL: - // IAM GATK user credentials -- only right is to PutObject into broad.gsa.gatk.run.reports bucket - final S3Service s3Service = initializeAWSService(getAWSUploadAccessKey(), getAWSUploadSecretKey()); - - // Create an S3Object based on a file, with Content-Length set automatically and - // Content-Type set based on the file's extension (using the Mimetypes utility class) - final S3Object fileObject = new S3Object(filename, contents); - //logger.info("Created S3Object" + fileObject); - //logger.info("Uploading " + localFile + " to AWS bucket"); - s3Object = s3Service.putObject(getS3ReportBucket(), fileObject); - isSuccess.set(true); - break; - default: - throw new IllegalStateException("Unexpected AWS exception"); - } - } catch ( S3ServiceException e ) { - exceptDuringRunReport("S3 exception occurred", e); - } catch ( NoSuchAlgorithmException e ) { - exceptDuringRunReport("Couldn't calculate MD5", e); - } catch ( IOException e ) { - exceptDuringRunReport("Couldn't read report file", e); - } catch ( Exception e ) { - exceptDuringRunReport("An unexpected exception occurred during posting", e); - } - } - } - - /** - * Post this GATK report to the AWS s3 GATK_Run_Report log - * - * @return the s3Object pointing to our pushed report, or null if we failed to push - */ - protected S3Object postReportToAWSS3() { - // modifying example code from http://jets3t.s3.amazonaws.com/toolkit/code-samples.html - this.hostName = Utils.resolveHostname(); // we want to fill in the host name - final String key = getReportFileName(); - logger.debug("Generating GATK report to AWS S3 with key " + key); - - try { - // create an byte output stream so we can capture the output as a byte[] - final ByteArrayOutputStream byteStream = new ByteArrayOutputStream(8096); - final OutputStream outputStream = new GZIPOutputStream(byteStream); - postReportToStream(outputStream); - outputStream.close(); - final byte[] report = byteStream.toByteArray(); - - // stop us from printing the annoying, and meaningless, mime types warning - final Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class); - mimeTypeLogger.setLevel(Level.FATAL); - - // Set the S3 upload on its own thread with timeout: - final S3PutRunnable s3run = new S3PutRunnable(key,report); - final Thread s3thread = new Thread(s3run); - s3thread.setDaemon(true); - s3thread.setName("S3Put-Thread"); - s3thread.start(); - - s3thread.join(s3PutTimeOutInMilliseconds); - - if(s3thread.isAlive()){ - s3thread.interrupt(); - exceptDuringRunReport("Run statistics report upload to AWS S3 timed-out"); - } else if(s3run.isSuccess.get()) { - logger.info("Uploaded run statistics report to AWS S3"); - logger.debug("Uploaded to AWS: " + s3run.s3Object); - return s3run.s3Object; - } else { - // an exception occurred, the thread should have already invoked the exceptDuringRunReport function - } - } catch ( IOException e ) { - exceptDuringRunReport("Couldn't read report file", e); - } catch ( InterruptedException e) { - exceptDuringRunReport("Run statistics report upload interrupted", e); - } - - return null; - } - - // --------------------------------------------------------------------------- - // - // Error handling code - // - // --------------------------------------------------------------------------- - - /** - * Note that an exception occurred during creating or writing this report - * @param msg the message to print - * @param e the exception that occurred - */ - @Ensures("exceptionOccurredDuringPost()") - private void exceptDuringRunReport(final String msg, final Throwable e) { - this.errorMessage = msg; - this.errorThrown = e; - logger.debug("A problem occurred during GATK run reporting [*** everything is fine, but no report could be generated; please do not post this to the support forum ***]. Message is: " + msg + ". Error message is: " + e.getMessage()); - } - - /** - * Note that an exception occurred during creating or writing this report - * @param msg the message to print - */ - @Ensures("exceptionOccurredDuringPost()") - private void exceptDuringRunReport(final String msg) { - this.errorMessage = msg; - logger.debug("A problem occurred during GATK run reporting [*** everything is fine, but no report could be generated; please do not post this to the support forum ***]. Message is " + msg); - } - - /** - * Did an error occur during the posting of this run report? - * @return true if so, false if not - */ - public boolean exceptionOccurredDuringPost() { - return getErrorMessage() != null; - } - - /** - * If an error occurred during posting of this report, retrieve the message of the error that occurred, or null if - * no error occurred - * @return a string describing the error that occurred, or null if none did - */ - public String getErrorMessage() { - return errorMessage; - } - - /** - * Get the throwable that caused the exception during posting of this message, or null if none was available - * - * Note that getting a null valuable from this function doesn't not imply that no error occurred. Some - * errors that occurred many not have generated a throwable. - * - * @return the Throwable that caused the error, or null if no error occurred or was not caused by a throwable - */ - public Throwable getErrorThrown() { - return errorThrown; - } - - /** - * Helper method to format the exception that occurred during posting, or a string saying none occurred - * @return a non-null string - */ - @Ensures("result != null") - protected String formatError() { - return exceptionOccurredDuringPost() - ? String.format("Exception message=%s with cause=%s", getErrorMessage(), getErrorThrown()) - : "No exception occurred"; - } - - // --------------------------------------------------------------------------- - // - // Equals and hashcode -- purely for comparing reports for testing - // - // --------------------------------------------------------------------------- - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - GATKRunReport that = (GATKRunReport) o; - - if (maxMemory != that.maxMemory) return false; - if (nIterations != that.nIterations) return false; - if (numThreads != that.numThreads) return false; - if (runTime != that.runTime) return false; - if (totalMemory != that.totalMemory) return false; - if (endTime != null ? !endTime.equals(that.endTime) : that.endTime != null) return false; - if (hostName != null ? !hostName.equals(that.hostName) : that.hostName != null) return false; - if (id != null ? !id.equals(that.id) : that.id != null) return false; - if (javaVersion != null ? !javaVersion.equals(that.javaVersion) : that.javaVersion != null) return false; - if (mException != null ? !mException.equals(that.mException) : that.mException != null) return false; - if (machine != null ? !machine.equals(that.machine) : that.machine != null) return false; - if (percentTimeBlocking != null ? !percentTimeBlocking.equals(that.percentTimeBlocking) : that.percentTimeBlocking != null) - return false; - if (percentTimeRunning != null ? !percentTimeRunning.equals(that.percentTimeRunning) : that.percentTimeRunning != null) - return false; - if (percentTimeWaiting != null ? !percentTimeWaiting.equals(that.percentTimeWaiting) : that.percentTimeWaiting != null) - return false; - if (percentTimeWaitingForIO != null ? !percentTimeWaitingForIO.equals(that.percentTimeWaitingForIO) : that.percentTimeWaitingForIO != null) - return false; - if (startTime != null ? !startTime.equals(that.startTime) : that.startTime != null) return false; - if (svnVersion != null ? !svnVersion.equals(that.svnVersion) : that.svnVersion != null) return false; - if (tag != null ? !tag.equals(that.tag) : that.tag != null) return false; - if (userName != null ? !userName.equals(that.userName) : that.userName != null) return false; - if (walkerName != null ? !walkerName.equals(that.walkerName) : that.walkerName != null) return false; - - return true; - } - - @Override - public int hashCode() { - int result = id != null ? id.hashCode() : 0; - result = 31 * result + (mException != null ? mException.hashCode() : 0); - result = 31 * result + (startTime != null ? startTime.hashCode() : 0); - result = 31 * result + (endTime != null ? endTime.hashCode() : 0); - result = 31 * result + (int) (runTime ^ (runTime >>> 32)); - result = 31 * result + (walkerName != null ? walkerName.hashCode() : 0); - result = 31 * result + (svnVersion != null ? svnVersion.hashCode() : 0); - result = 31 * result + (int) (totalMemory ^ (totalMemory >>> 32)); - result = 31 * result + (int) (maxMemory ^ (maxMemory >>> 32)); - result = 31 * result + (userName != null ? userName.hashCode() : 0); - result = 31 * result + (hostName != null ? hostName.hashCode() : 0); - result = 31 * result + (javaVersion != null ? javaVersion.hashCode() : 0); - result = 31 * result + (machine != null ? machine.hashCode() : 0); - result = 31 * result + (int) (nIterations ^ (nIterations >>> 32)); - result = 31 * result + (tag != null ? tag.hashCode() : 0); - result = 31 * result + numThreads; - result = 31 * result + (percentTimeRunning != null ? percentTimeRunning.hashCode() : 0); - result = 31 * result + (percentTimeWaiting != null ? percentTimeWaiting.hashCode() : 0); - result = 31 * result + (percentTimeBlocking != null ? percentTimeBlocking.hashCode() : 0); - result = 31 * result + (percentTimeWaitingForIO != null ? percentTimeWaitingForIO.hashCode() : 0); - return result; - } - - // --------------------------------------------------------------------------- - // - // Code specifically for testing the GATKRunReport - // - // --------------------------------------------------------------------------- - - /** - * Enum specifying how the S3 uploader should behave. Must be normal by default. Purely for testing purposes - */ - protected enum AWSMode { - NORMAL, // write normally to AWS - FAIL_WITH_EXCEPTION, // artificially fail during writing - TIMEOUT // sleep, so we time out - } - /** Our AWS mode */ - private AWSMode awsMode = AWSMode.NORMAL; - /** The bucket were we send the GATK report on AWS/s3 */ - private String s3ReportBucket = REPORT_BUCKET_NAME; - /** Did we send the report to AWS? */ - private boolean wentToAWS = false; - - /** - * Send the report to the AWS test bucket -- for testing only - */ - protected void sendAWSToTestBucket() { - s3ReportBucket = TEST_REPORT_BUCKET_NAME; - } - - /** - * Has the report been written to AWS? - * - * Does not imply anything about the success of the send, just that it was attempted - * - * @return true if the report has been sent to AWS, false otherwise - */ - protected boolean wentToAWS() { - return wentToAWS; - } - - /** - * Purely for testing purposes. Tells the AWS uploader whether to actually upload or simulate errors - * @param mode what we want to do - */ - @Requires("mode != null") - protected void setAwsMode(final AWSMode mode) { - this.awsMode = mode; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/RODRecordListImpl.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/RODRecordListImpl.java deleted file mode 100644 index 7296c39ae..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/RODRecordListImpl.java +++ /dev/null @@ -1,129 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata; - -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.HasGenomeLocation; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Sep 10, 2009 - * Time: 6:10:48 PM - * To change this template use File | Settings | File Templates. - */ -public class RODRecordListImpl extends AbstractList implements Comparable, Cloneable, RODRecordList, HasGenomeLocation { - private List records; - private GenomeLoc location = null; - private String name = null; - - public RODRecordListImpl(String name) { - records = new ArrayList(); - this.name = name; - } - - /** - * Fully qualified constructor: instantiates a new GATKFeatureRecordList object with specified GATKFeature track name, location on the - * reference, and list of associated GATKFeatures. This is a knee-deep COPY constructor: passed name, loc, and data element - * objects will be referenced from the created GATKFeatureRecordList (so that changing them from outside will affect data - * in this object), however, the data elements will be copied into a newly - * allocated list, so that the 'data' collection argument can be modified afterwards without affecting the state - * of this record list. WARNING: this constructor is (semi-)validating: passed name and location - * are allowed to be nulls (although it maybe unsafe, use caution), but if they are not nulls, then passed non-null GATKFeature data - * elements must have same track name, and their locations must overlap with the passed 'location' argument. Null - * data elements or null 'data' collection argument are allowed as well. - * @param name the name of the track - * @param data the collection of features at this location - * @param loc the location - */ - public RODRecordListImpl(String name, Collection data, GenomeLoc loc) { - this.records = new ArrayList(data==null?0:data.size()); - this.name = name; - this.location = loc; - if ( data == null || data.size() == 0 ) return; // empty dataset, nothing to do - for ( GATKFeature r : data ) { - records.add(r); - if ( r == null ) continue; - if ( ! this.name.equals(r.getName() ) ) { - throw new ReviewedGATKException("Attempt to add GATKFeature with non-matching name "+r.getName()+" to the track "+name); - } - if ( location != null && ! location.overlapsP(r.getLocation()) ) { - throw new ReviewedGATKException("Attempt to add GATKFeature that lies outside of specified interval "+location+"; offending GATKFeature:\n"+r.toString()); - } - } - } - - - public GenomeLoc getLocation() { return location; } - public String getName() { return name; } - public Iterator iterator() { return records.iterator() ; } - public void clear() { records.clear(); } - public boolean isEmpty() { return records.isEmpty(); } - - public boolean add(GATKFeature record) { add(record, false); return true;} - - @Override - public GATKFeature get(int i) { - return records.get(i); - } - - public void add(GATKFeature record, boolean allowNameMismatch) { - if ( record != null ) { - if ( ! allowNameMismatch && ! name.equals(record.getName() ) ) - throw new ReviewedGATKException("Attempt to add GATKFeature with non-matching name "+record.getName()+" to the track "+name); - } - records.add(record); - } - - public void add(RODRecordList records ) { add( records, false ); } - - public void add(RODRecordList records, boolean allowNameMismatch) { - for ( GATKFeature record : records ) - add(record, allowNameMismatch); - } - - public int size() { return records.size() ; } - - /** - * Compares this object with the specified object for order. Returns a - * negative integer, zero, or a positive integer as this object is less - * than, equal to, or greater than the specified object. - * - * @param that the object to be compared. - * @return a negative integer, zero, or a positive integer as this object - * is less than, equal to, or greater than the specified object. - * @throws ClassCastException if the specified object's type prevents it - * from being compared to this object. - */ - public int compareTo(RODRecordList that) { - return getLocation().compareTo(that.getLocation()); //To change body of implemented methods use File | Settings | File Templates. - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/RefMetaDataTracker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/RefMetaDataTracker.java deleted file mode 100644 index 7ccf6e572..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/RefMetaDataTracker.java +++ /dev/null @@ -1,497 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.utils.commandline.RodBinding; -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.util.*; - -/** - * This class represents the Reference Metadata available at a particular site in the genome. It can be - * used to conveniently lookup the RMDs at this site, as well just getting a list of all of the RMDs - * - * The standard interaction model is: - * - * Traversal system arrives at a site, which has a bunch of RMDs covering it - * Traversal passes creates a tracker and passes it to the walker - * walker calls get(rodBinding) to obtain the RMDs values at this site for the track - * associated with rodBinding. - * - * Note that this is an immutable class. Once created the underlying data structures - * cannot be modified - * - * User: mdepristo - * Date: Apr 3, 2009 - * Time: 3:05:23 PM - */ -public class RefMetaDataTracker { - // TODO: this should be a list, not a bindings, actually - private final static RODRecordList EMPTY_ROD_RECORD_LIST = new RODRecordListImpl("EMPTY"); - - final Map bindings; - final protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class); - public final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker(); - - // ------------------------------------------------------------------------------------------ - // - // - // Special ENGINE interaction functions - // - // - // ------------------------------------------------------------------------------------------ - - /** - * Create an tracker with no bindings - */ - public RefMetaDataTracker() { - bindings = Collections.emptyMap(); - } - - public RefMetaDataTracker(final Collection allBindings) { - // set up the bindings - if ( allBindings.isEmpty() ) - bindings = Collections.emptyMap(); - else { - final Map tmap = new HashMap(allBindings.size()); - for ( RODRecordList rod : allBindings ) { - if ( rod != null && ! rod.isEmpty() ) - tmap.put(canonicalName(rod.getName()), rod); - } - - // ensure that no one modifies the bindings itself - bindings = Collections.unmodifiableMap(tmap); - } - } - - // ------------------------------------------------------------------------------------------ - // - // - // Generic accessors - // - // - // ------------------------------------------------------------------------------------------ - - /** - * Gets all of the Tribble features spanning this locus, returning them as a list of specific - * type T extending Feature. This function looks across all tracks to find the Features, so - * if you have two tracks A and B each containing 1 Feature, then getValues will return - * a list containing both features. - * - * Note that this function assumes that all of the bound features are instances of or - * subclasses of T. A ClassCastException will occur if this isn't the case. If you want - * to get all Features without any danger of such an exception use the root Tribble - * interface Feature. - * - * @param type The type of the underlying objects bound here - * @param as above - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. - */ - @Requires({"type != null"}) - @Ensures("result != null") - public List getValues(final Class type) { - return addValues(bindings.keySet(), type, new ArrayList(), null, false, false); - } - - /** - * Provides the same functionality as @link #getValues(Class) but will only include - * Features that start as the GenomeLoc provide onlyAtThisLoc. - * - * @param type The type of the underlying objects bound here - * @param onlyAtThisLoc - * @param as above - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. - */ - @Requires({"type != null", "onlyAtThisLoc != null"}) - @Ensures("result != null") - public List getValues(final Class type, final GenomeLoc onlyAtThisLoc) { - return addValues(bindings.keySet(), type, new ArrayList(), onlyAtThisLoc, true, false); - } - - /** - * Uses the same logic as @link #getValues(Class) but arbitrary select one of the resulting - * elements of the list to return. That is, if there would be two elements in the result of - * @link #getValues(Class), one of these two is selected, and which one it will be isn't - * specified. Consequently, this method is only really safe if (1) you absolutely know - * that only one binding will meet the constraints of @link #getValues(Class) or (2) - * you truly don't care which of the multiple bindings available you are going to examine. - * - * If there are no bindings here, getFirstValue() return null - * - * @param type The type of the underlying objects bound here - * @param as above - * @return A random single element the RODs bound here, or null if none are bound. - */ - @Requires({"type != null"}) - public T getFirstValue(final Class type) { - return safeGetFirst(getValues(type)); - } - - /** - * Uses the same logic as @link #getValue(Class,GenomeLoc) to determine the list - * of eligible Features and @link #getFirstValue(Class) to select a single - * element from the interval list. - * - * @param type The type of the underlying objects bound here - * @param as above - * @param onlyAtThisLoc only Features starting at this site are considered - * @return A random single element the RODs bound here starting at onlyAtThisLoc, or null if none are bound. - */ - @Requires({"type != null", "onlyAtThisLoc != null"}) - public T getFirstValue(final Class type, final GenomeLoc onlyAtThisLoc) { - return safeGetFirst(getValues(type, onlyAtThisLoc)); - } - - /** - * Same logic as @link #getFirstValue(RodBinding, boolean) but prioritizes records from prioritizeThisLoc if available - * - * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched - * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features - * @param prioritizeThisLoc only Features starting at this site are considered - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. - */ - @Requires({"rodBindings != null", "prioritizeThisLoc != null"}) - @Ensures("result != null") - public List getPrioritizedValue(final Collection> rodBindings, final GenomeLoc prioritizeThisLoc) { - final List results = new ArrayList<>(); - - for ( final RodBinding rodBinding : rodBindings ) { - - // if there's a value at the prioritized location, take it - T value = getFirstValue(rodBinding, prioritizeThisLoc); - - // otherwise, grab any one - if ( value == null ) - value = getFirstValue(rodBinding); - - // add if not null - if ( value != null ) - results.add(value); - } - - return results; - } - - /** - * Gets all of the Tribble features bound to RodBinding spanning this locus, returning them as - * a list of specific type T extending Feature. - * - * Note that this function assumes that all of the bound features are instances of or - * subclasses of T. A ClassCastException will occur if this isn't the case. - * - * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched - * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. - */ - @Requires({"rodBinding != null"}) - @Ensures("result != null") - public List getValues(final RodBinding rodBinding) { - return addValues(rodBinding.getName(), rodBinding.getType(), new ArrayList(1), getTrackDataByName(rodBinding), null, false, false); - } - - /** - * Gets all of the Tribble features bound to any RodBinding in rodBindings, - * spanning this locus, returning them as a list of specific type T extending Feature. - * - * Note that this function assumes that all of the bound features are instances of or - * subclasses of T. A ClassCastException will occur if this isn't the case. - * - * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched - * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. - */ - @Requires({"rodBindings != null"}) - @Ensures("result != null") - public List getValues(final Collection> rodBindings) { - List results = new ArrayList(1); - for ( RodBinding rodBinding : rodBindings ) - results.addAll(getValues(rodBinding)); - return results; - } - - /** - * The same logic as @link #getValues(RodBinding) but enforces that each Feature start at onlyAtThisLoc - * - * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched - * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features - * @param onlyAtThisLoc only Features starting at this site are considered - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. - */ - @Requires({"rodBinding != null", "onlyAtThisLoc != null"}) - @Ensures("result != null") - public List getValues(final RodBinding rodBinding, final GenomeLoc onlyAtThisLoc) { - return addValues(rodBinding.getName(), rodBinding.getType(), new ArrayList(1), getTrackDataByName(rodBinding), onlyAtThisLoc, true, false); - } - - /** - * The same logic as @link #getValues(List) but enforces that each Feature start at onlyAtThisLoc - * - * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched - * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features - * @param onlyAtThisLoc only Features starting at this site are considered - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. - */ - @Requires({"rodBindings != null", "onlyAtThisLoc != null"}) - @Ensures("result != null") - public List getValues(final Collection> rodBindings, final GenomeLoc onlyAtThisLoc) { - List results = new ArrayList(1); - for ( RodBinding rodBinding : rodBindings ) - results.addAll(getValues(rodBinding, onlyAtThisLoc)); - return results; - } - - /** - * Uses the same logic as @getValues(RodBinding) to determine the list - * of eligible Features and select a single element from the resulting set - * of eligible features. - * - * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched - * @param as above - * @return A random single element the eligible Features found, or null if none are bound. - */ - @Requires({"rodBinding != null"}) - public T getFirstValue(final RodBinding rodBinding) { - return safeGetFirst(addValues(rodBinding.getName(), rodBinding.getType(), null, getTrackDataByName(rodBinding), null, false, true)); - } - - /** - * Uses the same logic as @getValues(RodBinding, GenomeLoc) to determine the list - * of eligible Features and select a single element from the resulting set - * of eligible features. - * - * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched - * @param as above - * @param onlyAtThisLoc only Features starting at this site are considered - * @return A random single element the eligible Features found, or null if none are bound. - */ - @Requires({"rodBinding != null", "onlyAtThisLoc != null"}) - public T getFirstValue(final RodBinding rodBinding, final GenomeLoc onlyAtThisLoc) { - return safeGetFirst(addValues(rodBinding.getName(), rodBinding.getType(), null, getTrackDataByName(rodBinding), onlyAtThisLoc, true, true)); - } - - /** - * Uses the same logic as @getValues(List) to determine the list - * of eligible Features and select a single element from the resulting set - * of eligible features. - * - * @param rodBindings Only Features coming from the tracks associated with these rodBindings are fetched - * @param as above - * @return A random single element the eligible Features found, or null if none are bound. - */ - @Requires({"rodBindings != null"}) - public T getFirstValue(final Collection> rodBindings) { - for ( RodBinding rodBinding : rodBindings ) { - T val = getFirstValue(rodBinding); - if ( val != null ) - return val; - } - return null; - } - - /** - * Uses the same logic as @getValues(RodBinding,GenomeLoc) to determine the list - * of eligible Features and select a single element from the resulting set - * of eligible features. - * - * @param rodBindings Only Features coming from the tracks associated with these rodBindings are fetched - * @param as above - * @param onlyAtThisLoc only Features starting at this site are considered - * @return A random single element the eligible Features found, or null if none are bound. - */ - @Requires({"rodBindings != null", "onlyAtThisLoc != null"}) - public T getFirstValue(final Collection> rodBindings, final GenomeLoc onlyAtThisLoc) { - for ( RodBinding rodBinding : rodBindings ) { - T val = getFirstValue(rodBinding, onlyAtThisLoc); - if ( val != null ) - return val; - } - return null; - } - - /** - * Is there a binding at this site to a ROD/track with the specified name? - * - * @param rodBinding the rod binding we want to know about - * @return true if any Features are bound in this tracker to rodBinding - */ - @Requires({"rodBinding != null"}) - public boolean hasValues(final RodBinding rodBinding) { - return bindings.containsKey(canonicalName(rodBinding.getName())); - } - - /** - * Get all of the RMD tracks at the current site. Each track is returned as a single compound - * object (RODRecordList) that may contain multiple RMD records associated with the current site. - * - * @return List of all tracks - */ - public List getBoundRodTracks() { - return new ArrayList(bindings.values()); - } - - /** - * The number of tracks with at least one value bound here - * @return the number of tracks with at least one bound Feature - */ - public int getNTracksWithBoundFeatures() { - return bindings.size(); - } - - // ------------------------------------------------------------------------------------------ - // Protected accessors using strings for unit testing - // ------------------------------------------------------------------------------------------ - - protected boolean hasValues(final String name) { - return bindings.containsKey(canonicalName(name)); - } - - protected List getValues(final Class type, final String name) { - return addValues(name, type, new ArrayList(), getTrackDataByName(name), null, false, false); - } - - protected List getValues(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { - return addValues(name, type, new ArrayList(), getTrackDataByName(name), onlyAtThisLoc, true, false); - } - - protected T getFirstValue(final Class type, final String name) { - return safeGetFirst(getValues(type, name)); - } - - protected T getFirstValue(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { - return safeGetFirst(getValues(type, name, onlyAtThisLoc)); - } - - // ------------------------------------------------------------------------------------------ - // - // - // Private utility functions - // - // - // ------------------------------------------------------------------------------------------ - - /** - * Helper function for getFirst() operations that takes a list of and - * returns the first element, or null if no such element exists. - * - * @param l - * @param - * @return - */ - @Requires({"l != null"}) - private T safeGetFirst(final List l) { - return l.isEmpty() ? null : l.get(0); - } - - private List addValues(final Collection names, - final Class type, - List values, - final GenomeLoc curLocation, - final boolean requireStartHere, - final boolean takeFirstOnly ) { - for ( String name : names ) { - RODRecordList rodList = getTrackDataByName(name); // require that the name is an exact match - values = addValues(name, type, values, rodList, curLocation, requireStartHere, takeFirstOnly ); - if ( takeFirstOnly && ! values.isEmpty() ) - break; - } - - return values; - } - - - - private List addValues(final String name, - final Class type, - List values, - final RODRecordList rodList, - final GenomeLoc curLocation, - final boolean requireStartHere, - final boolean takeFirstOnly ) { - for ( GATKFeature rec : rodList ) { - if ( ! requireStartHere || rec.getLocation().getStart() == curLocation.getStart() ) { // ok, we are going to keep this thing - Object obj = rec.getUnderlyingObject(); - if (!(type.isAssignableFrom(obj.getClass()))) - throw new UserException.CommandLineException("Unable to cast track named " + name + " to type of " + type.toString() - + " it's of type " + obj.getClass()); - - T objT = (T)obj; - if ( takeFirstOnly ) { - if ( values == null ) - values = Arrays.asList(objT); - else - values.add(objT); - - break; - } else { - if ( values == null ) - values = new ArrayList(); - values.add(objT); - } - } - } - - return values == null ? Collections.emptyList() : values; - } - - /** - * Finds the reference metadata track named 'name' and returns all ROD records from that track associated - * with the current site as a RODRecordList List object. If no data track with specified name is available, - * returns defaultValue wrapped as RODRecordList object. NOTE: if defaultValue is null, it will be wrapped up - * with track name set to 'name' and location set to null; otherwise the wrapper object will have name and - * location set to defaultValue.getID() and defaultValue.getLocation(), respectively (use caution, - * defaultValue.getLocation() may be not equal to what RODRecordList's location would be expected to be otherwise: - * for instance, on locus traversal, location is usually expected to be a single base we are currently looking at, - * regardless of the presence of "extended" RODs overlapping with that location). - * @param name track name - * @return track data for the given rod - */ - private RODRecordList getTrackDataByName(final String name) { - final String luName = canonicalName(name); - RODRecordList l = bindings.get(luName); - return l == null ? EMPTY_ROD_RECORD_LIST : l; - } - - private RODRecordList getTrackDataByName(final RodBinding binding) { - return getTrackDataByName(binding.getName()); - } - - /** - * Returns the canonical name of the rod name (lowercases it) - * @param name the name of the rod - * @return canonical name of the rod - */ - private String canonicalName(final String name) { - // todo -- remove me after switch to RodBinding syntax - return name.toLowerCase(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/ReferenceDependentFeatureCodec.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/ReferenceDependentFeatureCodec.java deleted file mode 100644 index 9bff00dd8..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/ReferenceDependentFeatureCodec.java +++ /dev/null @@ -1,43 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata; - -import htsjdk.tribble.FeatureCodec; -import org.broadinstitute.gatk.utils.GenomeLocParser; - -/** - * An interface marking that a given Tribble feature/codec is actually dependent on context within the - * reference, rather than having a dependency only on the contig, start, and stop of the given feature. - * A HACK. Tribble should contain all the information in needs to decode the unqualified position of - * a feature. - */ -public interface ReferenceDependentFeatureCodec { - /** - * Sets the appropriate GenomeLocParser, providing additional context when decoding larger and more variable features. - * @param genomeLocParser The parser to supply. - */ - public void setGenomeLocParser(GenomeLocParser genomeLocParser); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/ReferenceOrderedDatum.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/ReferenceOrderedDatum.java deleted file mode 100644 index 95de83208..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/ReferenceOrderedDatum.java +++ /dev/null @@ -1,66 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata; - -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.HasGenomeLocation; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; - -/** - * Created by IntelliJ IDEA. - * User: mdepristo - * Date: Feb 27, 2009 - * Time: 10:49:47 AM - * To change this template use File | Settings | File Templates. - */ -public interface ReferenceOrderedDatum extends Comparable, HasGenomeLocation { - public String getName(); - public boolean parseLine(final Object header, final String[] parts) throws IOException; - public String toString(); - public String toSimpleString(); - public String repl(); - - /** - * Used by the ROD system to determine how to split input lines - * @return Regex string delimiter separating fields - */ - public String delimiterRegex(); - - public GenomeLoc getLocation(); - public int compareTo( ReferenceOrderedDatum that ); - - /** - * Backdoor hook to read header, meta-data, etc. associated with the file. Will be - * called by the ROD system before streaming starts - * - * @param source source data file on disk from which this rod stream will be pulled - * @return a header object that will be passed to parseLine command - */ - public Object initialize(final File source) throws FileNotFoundException; -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/SeekableRODIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/SeekableRODIterator.java deleted file mode 100644 index 4126214cf..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/SeekableRODIterator.java +++ /dev/null @@ -1,412 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata; - -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.util.CloseableIterator; -import org.broadinstitute.gatk.engine.iterators.PushbackIterator; -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; -import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; - -/** - * Wrapper class for iterators over ROD objects. It is assumed that the underlying iterator can only - * perform standard next() operation, which advances it to the next ROD in the stream (i.e. reads the data file - * line by line). This iterator 1) shifts the focus from record-based traversal to position-based traversal, - * and 2) adds querying seekForward() method. - * - * Namely, this iterator's next() method advances not to the next ROD in the underlying stream, but to the next - * genomic position covered by (at least one) ROD, and returns all RODs overlapping with that position as a RODRecordList - * collection-like object. Similarly, when seekForward(interval) is called, this iterator skips all the RODs from the - * underlying stream, until it reaches specified genomic interval, and returns the list of all RODs overlapping with that interval. - * - * NOTE: this iterator has a STATE: next() operation is not allowed after a seekForward() to a non-point (extended) interval - * of length > 1. Such a call would leave the iterator in an inconsistent state. seekForward() can always be called after - * either seekForward() or next() (as long as usual ordering criteria are satisfied: the query interval location can neither - * start before the current position, nor end before the previous query end). seekForward to an interval of length 1 - * reenables next() operation. - * - * Created by IntelliJ IDEA. - * User: asivache - * Date: Sep 10, 2009 - * Time: 6:20:46 PM - * To change this template use File | Settings | File Templates. - */ -public class SeekableRODIterator implements LocationAwareSeekableRODIterator { - /** - * Header for the datasource backing this iterator. - */ - private final Object header; - - /** - * The parser, used to construct new genome locs. - */ - private final GenomeLocParser parser; - - private final SAMSequenceDictionary sequenceDictionary; - - private PushbackIterator it; - List records = null; // here we will keep a pile of records overlaping with current position; when we iterate - // and step out of record's scope, we purge it from the list - String name = null; // name of the ROD track wrapped by this iterator. Will be pulled from underlying iterator. - - int curr_position = 0; // where the iterator is currently positioned on the genome - int max_position = 0; // the rightmost stop position of currently loaded records - String curr_contig = null; // what contig the iterator is currently on - boolean next_is_allowed = true; // see discussion below. next() is illegal after seek-forward queries of length > 1 - - // the stop position of the last query. We can query only in forward direction ("seek forward"); - // it is not only the start position of every successive query that can not be before the start - // of the previous one (curr_start), but it is also illegal for a query interval to *end* before - // the end of previous query, otherwise we can end up in an inconsistent state - int curr_query_end = -1; - - // EXAMPLE of inconsistency curr_query_end guards against: - // record 1 record 2 - // ---------- ----------- - // -------------------------------------------------- REF - // ------------------------- query 1 (interval 1) - // ---------- query 2 (interval 2) - // --------------- query 3 - // - // If we query first for interval 1, both record 1 and record 2 will be loaded. - // Query for interval 2, on the other hand, should return only record 1, but after - // query 1 was performed, record 2 is already loaded from the file. If, on the other hand, - // we try to un-load it from memory, we won't be able to read it again. Hence query 2 is not - // allowed after query 1. Note also, that curr_query_end is not equivalent to max_position: - // the latter only tracks where currently loaded records end (and hence helps to re-load records); - // after query 1 is performed, max_position will be the end of record 2, but query 3 is still - // perfectly legal after query 1. - // - // IMPORTANT NOTE: it follows from the above discussion and example that next() is illegal after ANY - // seek-forward query EXCEPT those that are performed with length-1 intervals (queryInterval.start=queryinteval.stop). - // Indeed, in the example above, after, e.g., query 1 is performed, the iterator is "located" at the start - // of interval 1, but record1 and record 2 are already loaded. On the other hand, a subsequent call to next() would - // need to shift iterator's position by 1 base and return only record 1. - // - // This implementation tracks the query history and makes next() illegal after a seekforward query of length > 1, - // but re-enables next() again after a length-1 query. - - public SeekableRODIterator(Object header,SAMSequenceDictionary rodDictionary,SAMSequenceDictionary referenceDictionary,GenomeLocParser parser,CloseableIterator it) { - this.header = header; - this.parser = parser; - this.sequenceDictionary = rodDictionary; - this.it = new PushbackIterator(it); - records = new LinkedList(); - // the following is a trick: we would like the iterator to know the actual name assigned to - // the ROD implementing object we are working with. But the only way to do that is to - // get an instance of that ROD and query it for its name. Now, the only generic way we have at this point to instantiate - // the ROD is to make the underlying stream iterator to do it for us. So we are reading (or rather peeking into) - // the first line of the track data file just to get the ROD object created. - GATKFeature r = null; - if (this.it.hasNext()) r = this.it.element(); - name = (r==null?null:r.getName()); - - curr_contig = referenceDictionary.getSequence(0).getSequenceName(); - } - - /** - * Gets the header associated with the backing input stream. - * @return the ROD header. - */ - @Override - public Object getHeader() { - return header; - } - - /** - * Gets the sequence dictionary associated with the backing input stream. - * @return sequence dictionary from the ROD header. - */ - @Override - public SAMSequenceDictionary getSequenceDictionary() { - return sequenceDictionary; - } - - - /** - * Returns true if the data we iterate over has records associated with (any, not necessarily adjacent) - * genomic position farther along the reference. - * @return - */ - public boolean hasNext() { - - // if we did not walk to the very end of the interval(s) covered by currently loaded - // annotations (records), then we definitely have data for next genomic location - if ( curr_position < max_position ) return true; - - // we are past currently loaded stuff; we have next if there are more lines to load: - return it.hasNext(); - } - - // Returns point location (i.e. genome loc of length 1) on the reference, to which this iterator will advance - // upon next call to next(). - public GenomeLoc peekNextLocation() { - if ( curr_position + 1 <= max_position ) return parser.createGenomeLoc(curr_contig,curr_position+1); - - // sorry, next reference position is not covered by the RODs we are currently holding. In this case, - // the location we will jump to upon next call to next() is the start of the next ROD record that we did - // not read yet: - if ( it.hasNext() ) { - GATKFeature r = it.element(); // peek, do not load! - return parser.createGenomeLoc(r.getLocation().getContig(),r.getLocation().getStart()); - } - return null; // underlying iterator has no more records, there is no next location! - } - - /** Advances iterator to the next genomic position that has ROD record(s) associated with it, - * and returns all the records overlapping with that position as a RODList. The location of the whole - * RODList object will be set to the smallest interval subsuming genomic intervals of all returned records. - * Note that next() is disabled (will throw an exception) after seekForward() operation with query length > 1. - * @return list of all RODs overlapping with the next "covered" genomic position - */ - public RODRecordList next() { - if ( ! next_is_allowed ) - throw new ReviewedGATKException("Illegal use of iterator: Can not advance iterator with next() after seek-forward query of length > 1"); - - curr_position++; - // curr_query_end = -1; - - if ( curr_position <= max_position ) { - - // we still have bases covered by at least one currently loaded record; - // we have to purge only subset of records, on which we moved past the end - purgeOutOfScopeRecords(); - } else { - // ooops, we are past the end of all loaded records - kill them all at once, - // load next record and reinitialize by fastforwarding current position to the start of next record - records.clear(); - GATKFeature r = it.next(); // if hasNext() previously returned true, we are guaranteed that this call to reader.next() is safe - records.add( r ); - curr_contig = r.getLocation().getContig(); - curr_position = r.getLocation().getStart(); - max_position = r.getLocation().getStop(); - } - - // current position is ste and at this point 'records' only keeps those annotations, on which we did not reach the end yet - // (we might have reloaded records completely if it was necessary); but we are not guaranteed yet that we - // hold ALL the records overlapping with the current position. Time to check if we just walked into the interval(s) - // covered by new records, so we need to load them too: - - while ( it.hasNext() ) { - GATKFeature r = it.element(); - if ( r == null ) { - it.next(); - continue; - } - - GenomeLoc currentContig = parser.createOverEntireContig(curr_contig); - GenomeLoc thatContig = r.getLocation(); - - if ( currentContig.isPast(thatContig) ) - throw new UserException("LocationAwareSeekableRODIterator: contig " +r.getLocation().getContig() + - " occurs out of order in track " + r.getName() ); - if ( currentContig.isBefore(thatContig) ) break; // next record is on a higher contig, we do not need it yet... - - if ( r.getLocation().getStart() < curr_position ) - throw new UserException("LocationAwareSeekableRODIterator: track "+r.getName() + - " is out of coordinate order on contig "+r.getLocation() + " compared to " + curr_contig + ":" + curr_position); - - if ( r.getLocation().getStart() > curr_position ) break; // next record starts after the current position; we do not need it yet - - r = it.next(); // we got here only if we do need next record, time to load it for real - - int stop = r.getLocation().getStop(); - if ( stop < curr_position ) throw new ReviewedGATKException("DEBUG: encountered contig that should have been loaded earlier"); // this should never happen - if ( stop > max_position ) max_position = stop; // max_position keeps the rightmost stop position across all loaded records - records.add(r); - } - - // 'records' and current position are fully updated. Last, we need to set the location of the whole track - // (collection of ROD records) to the genomic site we are currently looking at, and return the list - - return new RODRecordListImpl(name,records, parser.createGenomeLoc(curr_contig,curr_position)); - } - - /** - * Removes from the underlying collection the last element returned by the - * iterator (optional operation). This method can be called only once per - * call to next. The behavior of an iterator is unspecified if - * the underlying collection is modified while the iteration is in - * progress in any way other than by calling this method. - * - * @throws UnsupportedOperationException if the remove - * operation is not supported by this Iterator. - * @throws IllegalStateException if the next method has not - * yet been called, or the remove method has already - * been called after the last call to the next - * method. - */ - public void remove() { - throw new UnsupportedOperationException("LocationAwareSeekableRODIterator does not implement remove() operation"); - } - - - /** - * Returns the current "position" (not location!! ;) ) of this iterator. This method is used by the sharding - * system when it searches for available iterators in the pool that can be reused to resume traversal. - * When iterator is advanced using next(), current position - * is the same as 'location'. However, after a seekForward() query with extended interval, returned position - * will be set to the last position of the query interval, to disable (illegal) attempts to roll the iterator - * back and re-start traversal from current location. - * @return Current ending position of the iterator, or null if no position exists. - */ - public GenomeLoc position() { - if ( curr_contig == null ) return null; - if ( curr_query_end > curr_position ) { - // do not attempt to reuse this iterator if the position we need it for lies before the end of last query performed - return parser.createGenomeLoc(curr_contig,curr_query_end,curr_query_end); - } - else { - return parser.createGenomeLoc(curr_contig,curr_position); - } - } - - /** - * Seeks forward through the file until the specified interval is reached. - * The location object interval can be either a single point or an extended interval. All - * ROD records overlapping with the whole interval will be returned, or null if no such records exist. - * - * Query interval must start at or after the iterator's current location, or exception will be thrown. - * - * Query interval must end at or after the stop position of the previous query, if any, or an exception will - * be thrown: subsequent queries that end before the stop of previous ones are illegal. - * - * If seekForward() is performed to an extended (length > 1 i.e. start != stop) interval, next() operation becomes - * illegal (the iterator changes state). Only seekForward() calls are allowed thereafter, until a seekForward() call - * to a length-1 interval is performed, which re-enables next(). seekForward() queries with length-1 intervals can - * always be safely intermixed with next() (as long as ordering is respected and query intervals are at or after the - * current position). - * - * Note that in contrast to - * next() (which always advances current position of the iterator on the reference), this method scrolls - * forward ONLY if the specified interval is ahead of the current location of - * the iterator. However, if called again with the same 'interval' argument as before, seekForward will NOT - * advance, but will simply return the same ROD list as before. - * - * - * @param interval point-like genomic location to fastforward to. - * @return ROD object at (or overlapping with) the specified position, or null if no such ROD exists. - */ - public RODRecordList seekForward(GenomeLoc interval) { - - if ( interval.isBefore(parser.createOverEntireContig(curr_contig)) && - !(interval.getStart() == 0 && interval.getStop() == 0 && interval.getContig().equals(curr_contig)) ) // This criteria is syntactic sugar for 'seek to right before curr_contig' - throw new ReviewedGATKException("Out of order query: query contig "+interval.getContig()+" is located before "+ - "the iterator's current contig"); - if ( interval.getContig().equals(curr_contig) ) { - if ( interval.getStart() < curr_position ) - throw new ReviewedGATKException("Out of order query: query position "+interval +" is located before "+ - "the iterator's current position "+curr_contig + ":" + curr_position); - if ( interval.getStop() < curr_query_end ) - throw new ReviewedGATKException("Unsupported querying sequence: current query interval " + - interval+" ends before the end of previous query interval ("+curr_query_end+")"); - } - - curr_position = interval.getStart(); - curr_query_end = interval.getStop(); - - next_is_allowed = ( curr_position == curr_query_end ); // we can call next() later only if interval length is 1 - - if ( interval.getContig().equals(curr_contig) && curr_position <= max_position ) { - // some of the intervals we are currently keeping do overlap with the query interval - - purgeOutOfScopeRecords(); - } else { - // clean up and get ready for fast-forwarding towards the requested position - records.clear(); - max_position = -1; - curr_contig = interval.getContig(); - } - - // curr_contig and curr_position are set to where we asked to scroll to - - while ( it.hasNext() ) { - GATKFeature r = it.next(); - if ( r == null ) continue; - - GenomeLoc currentContig = parser.createOverEntireContig(curr_contig); - GenomeLoc thatContig = r.getLocation(); - - if ( currentContig.isPast(thatContig) ) continue; // did not reach requested contig yet - if ( currentContig.isBefore(thatContig) ) { - it.pushback(r); // next record is on the higher contig, we do not need it yet... - break; - } - - // we get here if we are on the requested contig: - - if ( r.getLocation().getStop() < curr_position ) continue; // did not reach the requested interval yet - - if ( r.getLocation().getStart() > curr_query_end ) { - // past the query interval - it.pushback(r); - break; - } - - // we get here only if interval of the record r overlaps with query interval, so the record should be loaded - if ( r.getLocation().getStop() > max_position ) max_position = r.getLocation().getStop(); - records.add(r); - } - - if ( records.size() > 0 ) { - return new RODRecordListImpl(name,records,interval); - } else { - return null; - } - - } - - /** - * Removes records that end before the curr_position from the list of currently kept records. This is a - * convenience (private) shortcut that does not perform extensive checking. In particular, it assumes that - * curr_position <= max_position, as well as that we are still on the same contig. - */ - private void purgeOutOfScopeRecords() { - Iterator i = records.iterator(); - while ( i.hasNext() ) { - GATKFeature r = i.next(); - if ( r.getLocation().getStop() < curr_position ) { - i.remove(); // we moved past the end of interval the record r is associated with, purge the record forever - } - } - - } - - @Override - public void close() { - if (this.it != null) ((CloseableIterator)this.it.getUnderlyingIterator()).close(); - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/VariantContextAdaptors.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/VariantContextAdaptors.java deleted file mode 100644 index 82a826c10..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/VariantContextAdaptors.java +++ /dev/null @@ -1,399 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata; - -import htsjdk.samtools.util.SequenceUtil; -import htsjdk.tribble.Feature; -import htsjdk.tribble.annotation.Strand; -import htsjdk.tribble.dbsnp.OldDbSNPFeature; -import htsjdk.tribble.gelitext.GeliTextFeature; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.classloader.PluginManager; -import org.broadinstitute.gatk.utils.codecs.hapmap.RawHapMapFeature; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import htsjdk.variant.variantcontext.*; - -import java.util.*; - -/** - * A terrible but temporary approach to converting objects to VariantContexts. If you want to add a converter, - * you need to create a adaptor object here and register a converter from your class to this object. When tribble arrives, - * we'll use a better approach. - * - * To add a new converter: - * - * create a subclass of VCAdaptor, overloading the convert operator - * add it to the static map from input type -> converter where the input type is the object.class you want to convert - * - * That's it - * - * @author depristo@broadinstitute.org - */ -public class VariantContextAdaptors { - // -------------------------------------------------------------------------------------------------------------- - // - // Generic support routines. Do not modify - // - // -------------------------------------------------------------------------------------------------------------- - - private static Map,VCAdaptor> adaptors = new HashMap,VCAdaptor>(); - - static { - PluginManager vcAdaptorManager = new PluginManager(VCAdaptor.class); - List adaptorInstances = vcAdaptorManager.createAllTypes(); - for(VCAdaptor adaptor: adaptorInstances) - adaptors.put(adaptor.getAdaptableFeatureType(),adaptor); - } - - public static boolean canBeConvertedToVariantContext(Object variantContainingObject) { - return adaptors.containsKey(variantContainingObject.getClass()); - } - - /** generic superclass */ - public interface VCAdaptor { - /** - * Gets the type of feature that this adaptor can 'adapt' into a VariantContext. - * @return Type of adaptable feature. Must be a Tribble feature class. - */ - Class getAdaptableFeatureType(); - VariantContext convert(String name, Object input, ReferenceContext ref); - } - - public static VariantContext toVariantContext(String name, Object variantContainingObject, ReferenceContext ref) { - if ( ! adaptors.containsKey(variantContainingObject.getClass()) ) - return null; - else { - return adaptors.get(variantContainingObject.getClass()).convert(name, variantContainingObject, ref); - } - } - - // -------------------------------------------------------------------------------------------------------------- - // - // From here below you can add adaptor classes for new rods (or other types) to convert to VC - // - // -------------------------------------------------------------------------------------------------------------- - private static class VariantContextAdaptor implements VCAdaptor { - /** - * 'Null' adaptor; adapts variant contexts to variant contexts. - * @return VariantContext. - */ - @Override - public Class getAdaptableFeatureType() { return VariantContext.class; } - - // already a VC, just cast and return it - @Override - public VariantContext convert(String name, Object input, ReferenceContext ref) { - return (VariantContext)input; - } - } - - // -------------------------------------------------------------------------------------------------------------- - // - // dbSNP to VariantContext - // - // -------------------------------------------------------------------------------------------------------------- - - private static class DBSnpAdaptor implements VCAdaptor { - private static boolean isSNP(OldDbSNPFeature feature) { - return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact"); - } - - private static boolean isMNP(OldDbSNPFeature feature) { - return feature.getVariantType().contains("mnp") && feature.getLocationType().contains("range"); - } - - private static boolean isInsertion(OldDbSNPFeature feature) { - return feature.getVariantType().contains("insertion"); - } - - private static boolean isDeletion(OldDbSNPFeature feature) { - return feature.getVariantType().contains("deletion"); - } - - private static boolean isIndel(OldDbSNPFeature feature) { - return isInsertion(feature) || isDeletion(feature) || isComplexIndel(feature); - } - - public static boolean isComplexIndel(OldDbSNPFeature feature) { - return feature.getVariantType().contains("in-del"); - } - - /** - * gets the alternate alleles. This method should return all the alleles present at the location, - * NOT including the reference base. This is returned as a string list with no guarantee ordering - * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest - * frequency). - * - * @return an alternate allele list - */ - public static List getAlternateAlleleList(OldDbSNPFeature feature) { - List ret = new ArrayList(); - for (String allele : getAlleleList(feature)) - if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele); - return ret; - } - - /** - * gets the alleles. This method should return all the alleles present at the location, - * including the reference base. The first allele should always be the reference allele, followed - * by an unordered list of alternate alleles. - * - * @return an alternate allele list - */ - public static List getAlleleList(OldDbSNPFeature feature) { - List alleleList = new ArrayList(); - // add ref first - if ( feature.getStrand() == Strand.POSITIVE ) - alleleList = Arrays.asList(feature.getObserved()); - else - for (String str : feature.getObserved()) - alleleList.add(SequenceUtil.reverseComplement(str)); - if ( alleleList.size() > 0 && alleleList.contains(feature.getNCBIRefBase()) - && !alleleList.get(0).equals(feature.getNCBIRefBase()) ) - Collections.swap(alleleList, alleleList.indexOf(feature.getNCBIRefBase()), 0); - - return alleleList; - } - - /** - * Converts non-VCF formatted dbSNP records to VariantContext. - * @return OldDbSNPFeature. - */ - @Override - public Class getAdaptableFeatureType() { return OldDbSNPFeature.class; } - - @Override - public VariantContext convert(String name, Object input, ReferenceContext ref) { - OldDbSNPFeature dbsnp = (OldDbSNPFeature)input; - - int index = dbsnp.getStart() - ref.getWindow().getStart() - 1; - if ( index < 0 ) - return null; // we weren't given enough reference context to create the VariantContext - - final byte refBaseForIndel = ref.getBases()[index]; - final boolean refBaseIsDash = dbsnp.getNCBIRefBase().equals("-"); - - boolean addPaddingBase; - if ( isSNP(dbsnp) || isMNP(dbsnp) ) - addPaddingBase = false; - else if ( isIndel(dbsnp) || dbsnp.getVariantType().contains("mixed") ) - addPaddingBase = refBaseIsDash || GATKVariantContextUtils.requiresPaddingBase(stripNullDashes(getAlleleList(dbsnp))); - else - return null; // can't handle anything else - - Allele refAllele; - if ( refBaseIsDash ) - refAllele = Allele.create(refBaseForIndel, true); - else if ( ! Allele.acceptableAlleleBases(dbsnp.getNCBIRefBase()) ) - return null; - else - refAllele = Allele.create((addPaddingBase ? (char)refBaseForIndel : "") + dbsnp.getNCBIRefBase(), true); - - final List alleles = new ArrayList(); - alleles.add(refAllele); - - // add all of the alt alleles - for ( String alt : getAlternateAlleleList(dbsnp) ) { - if ( Allele.wouldBeNullAllele(alt.getBytes())) - alt = ""; - else if ( ! Allele.acceptableAlleleBases(alt) ) - return null; - - alleles.add(Allele.create((addPaddingBase ? (char)refBaseForIndel : "") + alt, false)); - } - - final VariantContextBuilder builder = new VariantContextBuilder(); - builder.source(name).id(dbsnp.getRsID()); - builder.loc(dbsnp.getChr(), dbsnp.getStart() - (addPaddingBase ? 1 : 0), dbsnp.getEnd() - (addPaddingBase && refAllele.length() == 1 ? 1 : 0)); - builder.alleles(alleles); - return builder.make(); - } - - private static List stripNullDashes(final List alleles) { - final List newAlleles = new ArrayList(alleles.size()); - for ( final String allele : alleles ) { - if ( allele.equals("-") ) - newAlleles.add(""); - else - newAlleles.add(allele); - } - return newAlleles; - } - } - - // -------------------------------------------------------------------------------------------------------------- - // - // GELI to VariantContext - // - // -------------------------------------------------------------------------------------------------------------- - - private static class GeliTextAdaptor implements VCAdaptor { - /** - * Converts Geli text records to VariantContext. - * @return GeliTextFeature. - */ - @Override - public Class getAdaptableFeatureType() { return GeliTextFeature.class; } - - /** - * convert to a Variant Context, given: - * @param name the name of the ROD - * @param input the Rod object, in this case a RodGeliText - * @param ref the reference context - * @return a VariantContext object - */ - @Override - public VariantContext convert(String name, Object input, ReferenceContext ref) { - GeliTextFeature geli = (GeliTextFeature)input; - if ( ! Allele.acceptableAlleleBases(String.valueOf(geli.getRefBase())) ) - return null; - Allele refAllele = Allele.create(String.valueOf(geli.getRefBase()), true); - - // make sure we can convert it - if ( geli.getGenotype().isHet() || !geli.getGenotype().containsBase(geli.getRefBase())) { - // add the reference allele - List alleles = new ArrayList(); - List genotypeAlleles = new ArrayList(); - // add all of the alt alleles - for ( char alt : geli.getGenotype().toString().toCharArray() ) { - if ( ! Allele.acceptableAlleleBases(String.valueOf(alt)) ) { - return null; - } - Allele allele = Allele.create(String.valueOf(alt), false); - if (!alleles.contains(allele) && !refAllele.basesMatch(allele.getBases())) alleles.add(allele); - - // add the allele, first checking if it's reference or not - if (!refAllele.basesMatch(allele.getBases())) genotypeAlleles.add(allele); - else genotypeAlleles.add(refAllele); - } - - Map attributes = new HashMap(); - Collection genotypes = new ArrayList(); - Genotype call = GenotypeBuilder.create(name, genotypeAlleles); - - // add the call to the genotype list, and then use this list to create a VariantContext - genotypes.add(call); - alleles.add(refAllele); - GenomeLoc loc = ref.getGenomeLocParser().createGenomeLoc(geli.getChr(),geli.getStart()); - return new VariantContextBuilder(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles).genotypes(genotypes).log10PError(-1 * geli.getLODBestToReference()).attributes(attributes).make(); - } else - return null; // can't handle anything else - } - } - - // -------------------------------------------------------------------------------------------------------------- - // - // HapMap to VariantContext - // - // -------------------------------------------------------------------------------------------------------------- - - private static class HapMapAdaptor implements VCAdaptor { - /** - * Converts HapMap records to VariantContext. - * @return HapMapFeature. - */ - @Override - public Class getAdaptableFeatureType() { return RawHapMapFeature.class; } - - /** - * convert to a Variant Context, given: - * @param name the name of the ROD - * @param input the Rod object, in this case a RodGeliText - * @param ref the reference context - * @return a VariantContext object - */ - @Override - public VariantContext convert(String name, Object input, ReferenceContext ref) { - if ( ref == null ) - throw new UnsupportedOperationException("Conversion from HapMap to VariantContext requires a reference context"); - - RawHapMapFeature hapmap = (RawHapMapFeature)input; - - int index = hapmap.getStart() - ref.getWindow().getStart(); - if ( index < 0 ) - return null; // we weren't given enough reference context to create the VariantContext - - HashSet alleles = new HashSet(); - Allele refSNPAllele = Allele.create(ref.getBase(), true); - int deletionLength = -1; - - Map alleleMap = hapmap.getActualAlleles(); - // use the actual alleles, if available - if ( alleleMap != null ) { - alleles.addAll(alleleMap.values()); - Allele deletionAllele = alleleMap.get(RawHapMapFeature.INSERTION); // yes, use insertion here (since we want the reference bases) - if ( deletionAllele != null && deletionAllele.isReference() ) - deletionLength = deletionAllele.length(); - } else { - // add the reference allele for SNPs - alleles.add(refSNPAllele); - } - - // make a mapping from sample to genotype - String[] samples = hapmap.getSampleIDs(); - String[] genotypeStrings = hapmap.getGenotypes(); - - GenotypesContext genotypes = GenotypesContext.create(samples.length); - for ( int i = 0; i < samples.length; i++ ) { - // ignore bad genotypes - if ( genotypeStrings[i].contains("N") ) - continue; - - String a1 = genotypeStrings[i].substring(0,1); - String a2 = genotypeStrings[i].substring(1); - ArrayList myAlleles = new ArrayList(2); - - // use the mapping to actual alleles, if available - if ( alleleMap != null ) { - myAlleles.add(alleleMap.get(a1)); - myAlleles.add(alleleMap.get(a2)); - } else { - // ignore indels (which we can't handle without knowing the alleles) - if ( genotypeStrings[i].contains("I") || genotypeStrings[i].contains("D") ) - continue; - - Allele allele1 = Allele.create(a1, refSNPAllele.basesMatch(a1)); - Allele allele2 = Allele.create(a2, refSNPAllele.basesMatch(a2)); - - myAlleles.add(allele1); - myAlleles.add(allele2); - alleles.add(allele1); - alleles.add(allele2); - } - - Genotype g = GenotypeBuilder.create(samples[i], myAlleles); - genotypes.add(g); - } - - long end = hapmap.getEnd(); - if ( deletionLength > 0 ) - end += (deletionLength - 1); - VariantContext vc = new VariantContextBuilder(name, hapmap.getChr(), hapmap.getStart(), end, alleles).id(hapmap.getName()).genotypes(genotypes).make(); - return vc; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/package-info.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/package-info.java deleted file mode 100644 index e9e9714cb..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/package-info.java +++ /dev/null @@ -1,26 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata; \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/FeatureManager.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/FeatureManager.java deleted file mode 100644 index d466f3f1e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/FeatureManager.java +++ /dev/null @@ -1,280 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.tracks; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import htsjdk.tribble.Feature; -import htsjdk.tribble.FeatureCodec; -import htsjdk.tribble.NameAwareCodec; -import org.broadinstitute.gatk.engine.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.classloader.PluginManager; -import htsjdk.variant.vcf.AbstractVCFCodec; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.help.GATKDocUtils; - -import java.io.File; -import java.util.*; - - -/** - * Class for managing Tribble Feature readers available to the GATK. The features - * are dynamically determined via a PluginManager. This class provides convenient - * getter methods for obtaining FeatureDescriptor objects that collect all of the - * useful information about the Tribble Codec, Feature, and name in one place. - * - * @author depristo - */ -public class FeatureManager { - public static class FeatureDescriptor implements Comparable { - final String name; - final FeatureCodec codec; - - public FeatureDescriptor(final String name, final FeatureCodec codec) { - this.name = name; - this.codec = codec; - } - - public String getName() { - return name; - } - public String getSimpleFeatureName() { return getFeatureClass().getSimpleName(); } - public FeatureCodec getCodec() { - return codec; - } - public Class getCodecClass() { return codec.getClass(); } - public Class getFeatureClass() { return codec.getFeatureType(); } - - @Override - public String toString() { - return String.format("FeatureDescriptor name=%s codec=%s feature=%s", - getName(), getCodecClass().getName(), getFeatureClass().getName()); - } - - @Override - public int compareTo(FeatureDescriptor o) { - return getName().compareTo(o.getName()); - } - } - - private final PluginManager pluginManager; - private final Collection featureDescriptors = new TreeSet(); - private final boolean lenientVCFProcessing; - - /** - * Construct a FeatureManager without a master VCF header - */ - public FeatureManager() { - this(false); - } - - public FeatureManager(final boolean lenientVCFProcessing) { - this.lenientVCFProcessing = lenientVCFProcessing; - pluginManager = new PluginManager(FeatureCodec.class, "Codecs", "Codec"); - - for (final String rawName: pluginManager.getPluginsByName().keySet()) { - FeatureCodec codec = pluginManager.createByName(rawName); - String name = rawName.toUpperCase(); - FeatureDescriptor featureDescriptor = new FeatureDescriptor(name, codec); - featureDescriptors.add(featureDescriptor); - } - } - - /** - * Return the FeatureDescriptor whose getCodecClass().equals(codecClass). - * - * @param codecClass - * @return A FeatureDescriptor or null if none is found - */ - @Requires("codecClass != null") - public FeatureDescriptor getByCodec(Class codecClass) { - for ( FeatureDescriptor descriptor : featureDescriptors ) - if ( descriptor.getCodecClass().equals(codecClass) ) - return descriptor; - return null; - } - - /** - * Returns a collection of FeatureDescriptors that emit records of type featureClass - * - * @param featureClass - * @return A FeatureDescriptor or null if none is found - */ - @Requires("featureClass != null") - public Collection getByFeature(Class featureClass) { - Set consistentDescriptors = new TreeSet(); - - if (featureClass == null) - throw new IllegalArgumentException("trackRecordType value is null, please pass in an actual class object"); - - for ( FeatureDescriptor descriptor : featureDescriptors ) { - if ( featureClass.isAssignableFrom(descriptor.getFeatureClass())) - consistentDescriptors.add(descriptor); - } - return consistentDescriptors; - } - - /** - * Return the FeatureDescriptor with getID().equals(name) - * - * @param name - * @return A FeatureDescriptor or null if none is found - */ - @Requires("name != null") - public FeatureDescriptor getByName(String name) { - for ( FeatureDescriptor descriptor : featureDescriptors ) - if ( descriptor.getName().equalsIgnoreCase(name) ) - return descriptor; - return null; - } - - /** - * Returns the FeatureDescriptor that can read the contexts of File file, is one can be determined - * - * @param file - * @return A FeatureDescriptor or null if none is found - */ - @Requires({"file != null", "file.isFile()", "file.canRead()"}) - public FeatureDescriptor getByFiletype(File file) { - List canParse = new ArrayList(); - for ( FeatureDescriptor descriptor : featureDescriptors ) - if ( descriptor.getCodec().canDecode(file.getPath()) ) { - canParse.add(descriptor); - } - - if ( canParse.size() == 0 ) - return null; - else if ( canParse.size() > 1 ) - throw new ReviewedGATKException("BUG: multiple feature descriptors can read file " + file + ": " + canParse); - else - return canParse.get(0); - } - - /** - * Returns the FeatureDescriptor associated with the type described by triplet, or null if none is found - * @param triplet - * @return - */ - @Requires("triplet != null") - public FeatureDescriptor getByTriplet(RMDTriplet triplet) { - return getByName(triplet.getType()); - } - - /** - * @return all of the FeatureDescriptors available to the GATK. Never null - */ - @Ensures("result != null") - public Collection getFeatureDescriptors() { - return Collections.unmodifiableCollection(featureDescriptors); - } - - - /** - * Returns a list of the available tribble track names (vcf,dbsnp,etc) that we can load - * @return - */ - @Ensures("result != null") - public String userFriendlyListOfAvailableFeatures() { - return userFriendlyListOfAvailableFeatures(Feature.class); - } - - /** - * Returns a list of the available tribble track names (vcf,dbsnp,etc) that we can load - * restricted to only Codecs producting Features consistent with the requiredFeatureType - * @return - */ - @Ensures("result != null") - public String userFriendlyListOfAvailableFeatures(Class requiredFeatureType) { - final String nameHeader="Name", featureHeader = "FeatureType", docHeader="Documentation"; - - int maxNameLen = nameHeader.length(), maxFeatureNameLen = featureHeader.length(); - for ( final FeatureDescriptor descriptor : featureDescriptors ) { - if ( requiredFeatureType.isAssignableFrom(descriptor.getFeatureClass()) ) { - maxNameLen = Math.max(maxNameLen, descriptor.getName().length()); - maxFeatureNameLen = Math.max(maxFeatureNameLen, descriptor.getSimpleFeatureName().length()); - } - } - - StringBuilder docs = new StringBuilder(); - String format = "%" + maxNameLen + "s %" + maxFeatureNameLen + "s %s%n"; - docs.append(String.format(format, nameHeader, featureHeader, docHeader)); - for ( final FeatureDescriptor descriptor : featureDescriptors ) { - if ( requiredFeatureType.isAssignableFrom(descriptor.getFeatureClass()) ) { - final String DocURL = GATKDocUtils.helpLinksToGATKDocs(descriptor.getCodecClass()); - final String oneDoc; - if ( DocURL.contains("_sting_") ) { - oneDoc = String.format(format, - descriptor.getName(), - descriptor.getSimpleFeatureName(), - DocURL); - } else { - oneDoc = String.format(format, - descriptor.getName(), - descriptor.getSimpleFeatureName(), - "(this is an external codec and is not documented within GATK)"); - } - - docs.append(oneDoc); - } - } - - return docs.toString(); - } - - /** - * Create a new FeatureCodec of the type described in descriptor, assigning it the - * name (if possible) and providing it the genomeLocParser (where necessary) - * - * @param descriptor FeatureDescriptor of the Tribble FeatureCodec we want to create - * @param name the name to assign this codec - * @param genomeLocParser GenomeLocParser for ReferenceDependentFeatureCodecs - * @param remappedSampleName replacement sample name for single-sample vcfs, or null if we're not performing - * sample name remapping - * @return the feature codec itself - */ - @Requires({"descriptor != null", "name != null", "genomeLocParser != null"}) - @Ensures("result != null") - public FeatureCodec createCodec(final FeatureDescriptor descriptor, final String name, final GenomeLocParser genomeLocParser, - final String remappedSampleName) { - FeatureCodec codex = pluginManager.createByType(descriptor.getCodecClass()); - if ( codex instanceof NameAwareCodec ) - ((NameAwareCodec)codex).setName(name); - if ( codex instanceof ReferenceDependentFeatureCodec ) - ((ReferenceDependentFeatureCodec)codex).setGenomeLocParser(genomeLocParser); - if ( codex instanceof AbstractVCFCodec ) { - if ( lenientVCFProcessing ) { - ((AbstractVCFCodec)codex).disableOnTheFlyModifications(); - } - if ( remappedSampleName != null ) { - ((AbstractVCFCodec)codex).setRemappedSampleName(remappedSampleName); - } - } - - return codex; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/IndexDictionaryUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/IndexDictionaryUtils.java deleted file mode 100644 index 5c18d3a8e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/IndexDictionaryUtils.java +++ /dev/null @@ -1,114 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.tracks; - -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; -import org.apache.log4j.Logger; -import htsjdk.tribble.index.Index; -import htsjdk.tribble.index.MutableIndex; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.utils.SequenceDictionaryUtils; - -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - -/** - * Utilities for working with Sequence Dictionaries embedded in tribble indices - * - * @author Your Name - * @since Date created - */ -public class IndexDictionaryUtils { - private final static Logger logger = Logger.getLogger(IndexDictionaryUtils.class); - - // a constant we use for marking sequence dictionary entries in the Tribble index property list - public static final String SequenceDictionaryPropertyPredicate = "DICT:"; - - /** - * get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index - * @param index the index file to use - * @return a SAMSequenceDictionary if available, null if unavailable - */ - public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) { - SAMSequenceDictionary dict = new SAMSequenceDictionary(); - for (Map.Entry entry : index.getProperties().entrySet()) { - if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate)) - dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()), - Integer.valueOf(entry.getValue()))); - } - return dict; - } - - /** - * create the sequence dictionary with the contig list; a backup approach - * @param index the index file to use - * @param dict the sequence dictionary to add contigs to - * @return the filled-in sequence dictionary - */ - static SAMSequenceDictionary createSequenceDictionaryFromContigList(final Index index, final SAMSequenceDictionary dict) { - final List seqNames = index.getSequenceNames(); - if (seqNames == null) { - return dict; - } - for (final String name : seqNames) { - SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); - dict.addSequence(seq); - } - return dict; - } - - /** - * Sets the sequence dictionary of the given index. THE INDEX MUST BE MUTABLE (i.e. not Tabix). - * - * @param index the (mutable) index file to use - * @param dict the dictionary to use - */ - public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) { - for ( SAMSequenceRecord seq : dict.getSequences() ) { - final String contig = IndexDictionaryUtils.SequenceDictionaryPropertyPredicate + seq.getSequenceName(); - final String length = String.valueOf(seq.getSequenceLength()); - ((MutableIndex)index).addProperty(contig, length); - } - } - - public static void validateTrackSequenceDictionary(final String trackName, - final SAMSequenceDictionary trackDict, - final SAMSequenceDictionary referenceDict, - final ValidationExclusion.TYPE validationExclusionType ) { - // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation - if (trackDict == null || trackDict.size() == 0) - logger.warn("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); - else { - Set trackSequences = new TreeSet(); - for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) - trackSequences.add(dictionaryEntry.getSequenceName()); - SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict, false, null); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/RMDTrack.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/RMDTrack.java deleted file mode 100644 index 51cb8f443..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/RMDTrack.java +++ /dev/null @@ -1,147 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.tracks; - -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.util.CloseableIterator; -import org.apache.log4j.Logger; -import htsjdk.tribble.AbstractFeatureReader; -import htsjdk.tribble.CloseableTribbleIterator; -import htsjdk.tribble.Feature; -import htsjdk.tribble.FeatureCodec; -import org.broadinstitute.gatk.engine.refdata.utils.FeatureToGATKFeatureIterator; -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.File; -import java.io.IOException; - - -/** - * @author aaron - *

    - * Class RMDTrack - *

    - * the basics of what a reference metadata track must contain. - */ -public class RMDTrack { - private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class); - - // the basics of a track: - private final Class type; // our type - private final String name; // the name - private final File file; // the associated file we create the reader from - - // our feature reader - allows queries - private AbstractFeatureReader reader; - - // our sequence dictionary, which can be null - private final SAMSequenceDictionary dictionary; - - /** - * Parser to use when creating/parsing GenomeLocs. - */ - private final GenomeLocParser genomeLocParser; - - // our codec type - private final FeatureCodec codec; - - public Class getType() { - return type; - } - - public String getName() { - return name; - } - - public File getFile() { - return file; - } - - /** - * Create a track - * - * @param type the type of track, used for track lookup - * @param name the name of this specific track - * @param file the associated file, for reference or recreating the reader - * @param reader the feature reader to use as the underlying data source - * @param dict the sam sequence dictionary - * @param codec the feature codec we use to decode this type - */ - public RMDTrack(Class type, String name, File file, AbstractFeatureReader reader, SAMSequenceDictionary dict, GenomeLocParser genomeLocParser, FeatureCodec codec) { - this.type = type; - this.name = name; - this.file = file; - this.reader = reader; - this.dictionary = dict; - this.genomeLocParser = genomeLocParser; - this.codec = codec; - } - - /** - * @return how to get an iterator of the underlying data. This is all a track has to support, - * but other more advanced tracks support the query interface - */ - public CloseableIterator getIterator() { - try { - return new FeatureToGATKFeatureIterator(genomeLocParser,reader.iterator(),this.getName()); - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(getFile(), "Unable to read from file", e); - } - } - - public CloseableIterator query(GenomeLoc interval) throws IOException { - CloseableTribbleIterator iter = reader.query(interval.getContig(),interval.getStart(),interval.getStop()); - return new FeatureToGATKFeatureIterator(genomeLocParser, iter, this.getName()); - } - - public void close() { - try { - reader.close(); - } catch (IOException e) { - throw new UserException.MalformedFile("Unable to close reader " + reader.toString(),e); - } - reader = null; - } - - /** - * get the sequence dictionary from the track, if available - * @return a SAMSequenceDictionary if available, null if unavailable - */ - public SAMSequenceDictionary getSequenceDictionary() { - return dictionary; - } - - public Object getHeader() { - return reader.getHeader(); - } - - public FeatureCodec getCodec() { - return codec; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/RMDTrackBuilder.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/RMDTrackBuilder.java deleted file mode 100644 index dc9e96728..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/RMDTrackBuilder.java +++ /dev/null @@ -1,430 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.tracks; - -import htsjdk.samtools.SAMSequenceDictionary; -import org.apache.log4j.Logger; -import htsjdk.tribble.AbstractFeatureReader; -import htsjdk.tribble.FeatureCodec; -import htsjdk.tribble.Tribble; -import htsjdk.tribble.TribbleException; -import htsjdk.tribble.index.Index; -import htsjdk.tribble.index.IndexFactory; -import htsjdk.tribble.util.LittleEndianOutputStream; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.engine.io.stubs.VCFWriterArgumentTypeDescriptor; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet.RMDStorageType; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.file.FSLockWithShared; -import org.broadinstitute.gatk.utils.instrumentation.Sizeof; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.Map; - - -/** - * - * @author aaron - * ` - * Class RMDTrackBuilder - * - * This class keeps track of the available codecs, and knows how to put together a track of - * that gets iterators from the FeatureReader using Tribble. - * - */ -public class RMDTrackBuilder { // extends PluginManager { - /** - * our log, which we use to capture anything from this class - */ - private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class); - - // private sequence dictionary we use to set our tracks with - private final SAMSequenceDictionary dict; - - /** - * Private genome loc parser to use when building out new locs. - */ - private final GenomeLocParser genomeLocParser; - - /** - * Validation exclusions, for validating the sequence dictionary. - */ - private ValidationExclusion.TYPE validationExclusionType; - - private final FeatureManager featureManager; - - // If true, do not attempt to create index files if they don't exist or are outdated, and don't - // make any file lock acquisition calls on the index files. - private final boolean disableAutoIndexCreation; - - // Map of file name -> new sample name used when performing on-the-fly sample renaming - private final Map sampleRenameMap; - - /** - * Construct an RMDTrackerBuilder, allowing the user to define tracks to build after-the-fact. This is generally - * used when walkers want to directly manage the ROD system for whatever reason. Before using this constructor, - * please talk through your approach with the SE team. - * @param dict Sequence dictionary to use. - * @param genomeLocParser Location parser to use. - * @param validationExclusionType Types of validations to exclude, for sequence dictionary verification. - * @param disableAutoIndexCreation Do not auto-create index files, and do not use file locking when accessing index files. - * UNSAFE in general (because it causes us not to lock index files before reading them) -- - * suitable only for test suite use. - * @param sampleRenameMap Map of file name -> new sample name used when performing on-the-fly sample renaming - */ - public RMDTrackBuilder(final SAMSequenceDictionary dict, - final GenomeLocParser genomeLocParser, - final ValidationExclusion.TYPE validationExclusionType, - final boolean disableAutoIndexCreation, - final Map sampleRenameMap) { - this.dict = dict; - this.validationExclusionType = validationExclusionType; - this.genomeLocParser = genomeLocParser; - this.featureManager = new FeatureManager(GenomeAnalysisEngine.lenientVCFProcessing(validationExclusionType)); - this.disableAutoIndexCreation = disableAutoIndexCreation; - this.sampleRenameMap = sampleRenameMap; - } - - /** - * Return the feature manager this RMDTrackBuilder is using the create tribble tracks - * - * @return - */ - public FeatureManager getFeatureManager() { - return featureManager; - } - - /** - * create a RMDTrack of the specified type - * - * @param fileDescriptor a description of the type of track to build. - * - * @return an instance of the track - */ - public RMDTrack createInstanceOfTrack(RMDTriplet fileDescriptor) { - String name = fileDescriptor.getName(); - File inputFile = new File(fileDescriptor.getFile()); - - FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByTriplet(fileDescriptor); - if (descriptor == null) - throw new UserException.BadArgumentValue("-B",fileDescriptor.getType()); - - // return a feature reader track - Pair pair; - if (VCFWriterArgumentTypeDescriptor.isCompressed(inputFile.toString())) - pair = createTabixIndexedFeatureSource(descriptor, name, inputFile); - else - pair = getFeatureSource(descriptor, name, inputFile, fileDescriptor.getStorageType()); - if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file"); - return new RMDTrack(descriptor.getCodecClass(), name, inputFile, pair.first, pair.second, genomeLocParser, createCodec(descriptor, name, inputFile)); - } - - /** - * Convenience method simplifying track creation. Assume unnamed track based on a file rather than a stream. - * @param codecClass Type of Tribble codec class to build. - * @param inputFile Input file type to use. - * @return An RMDTrack, suitable for accessing reference metadata. - */ - public RMDTrack createInstanceOfTrack(Class codecClass, File inputFile) { - final FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByCodec(codecClass); - - if (descriptor == null) - throw new ReviewedGATKException("Unable to find type name for codec class " + codecClass.getName()); - - return createInstanceOfTrack(new RMDTriplet("anonymous",descriptor.getName(),inputFile.getAbsolutePath(),RMDStorageType.FILE,new Tags())); - } - - /** - * create a feature reader, without assuming there exists an index. This code assumes the feature - * reader of the appropriate type will figure out what the right index type is, and determine if it - * exists. - * - * @param descriptor the FeatureDescriptor describing the FeatureCodec we want to create - * @param name the name of the track - * @param inputFile the file to load - * @return a feature reader implementation - */ - private Pair createTabixIndexedFeatureSource(FeatureManager.FeatureDescriptor descriptor, String name, File inputFile) { - // we might not know the index type, try loading with the default reader constructor - logger.debug("Attempting to load " + inputFile + " as a tabix indexed file without validating it"); - try { - // getFeatureReader will detect that it's Tabix - return new Pair<>(AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name, inputFile)), null); - } catch (TribbleException e) { - throw new UserException(e.getMessage(), e); - } - } - - /** - * add a name to the codec, if it takes one - * @param descriptor the class to create a codec for - * @param name the name to assign this codec - * @param inputFile input file that we will be decoding - * @return the feature codec itself - */ - private FeatureCodec createCodec(final FeatureManager.FeatureDescriptor descriptor, final String name, final File inputFile) { - // The remappedSampleName will be null if either no on-the-fly sample renaming was requested, - // or the user's sample rename map file didn't contain an entry for this file: - final String remappedSampleName = sampleRenameMap != null ? sampleRenameMap.get(inputFile.getAbsolutePath()) : null; - - return featureManager.createCodec(descriptor, name, genomeLocParser, remappedSampleName); - } - - /** - * create a feature source object given: - * @param descriptor the FeatureDescriptor describing the FeatureCodec we want to create - * @param name the name of the codec - * @param inputFile the tribble file to parse - * @param storageType How the RMD is streamed into the input file. - * @return the input file as a FeatureReader - */ - private Pair getFeatureSource(FeatureManager.FeatureDescriptor descriptor, - String name, - File inputFile, - RMDStorageType storageType) { - // Feature source and sequence dictionary to use as the ultimate reference - AbstractFeatureReader featureSource = null; - SAMSequenceDictionary sequenceDictionary = null; - - // Detect whether or not this source should be indexed. - boolean canBeIndexed = (storageType == RMDStorageType.FILE); - - if(canBeIndexed) { - try { - Index index = loadIndex(inputFile, createCodec(descriptor, name, inputFile)); - try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); } - catch (ReviewedGATKException e) { } - - sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); - - // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match - if (sequenceDictionary.size() == 0 && dict != null) { - validateAndUpdateIndexSequenceDictionary(inputFile, index, dict); - - if ( ! disableAutoIndexCreation ) { - File indexFile = Tribble.indexFile(inputFile); - try { // re-write the index - writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); - } catch (IOException e) { - logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not affect your run of the GATK"); - } - } - - sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); - } - - featureSource = AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name, inputFile), index); - } - catch (TribbleException e) { - throw new UserException(e.getMessage()); - } - catch (IOException e) { - throw new UserException("I/O error loading or writing tribble index file for " + inputFile.getAbsolutePath(), e); - } - } - else { - featureSource = AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name, inputFile), false); - } - - return new Pair(featureSource,sequenceDictionary); - } - - /** - * create an index for the input file - * @param inputFile the input file - * @param codec the codec to use - * @return a linear index for the specified type - * @throws IOException if we cannot write the index file - */ - public synchronized Index loadIndex( final File inputFile, final FeatureCodec codec) throws IOException { - final File indexFile = Tribble.indexFile(inputFile); - final FSLockWithShared lock = new FSLockWithShared(indexFile); - Index idx = null; - - // If the index file exists and is readable, attempt to load it from disk. We'll get null back - // if a problem was discovered with the index file when it was inspected, and we'll get an - // in-memory index back in the case where the index file could not be locked. - if (indexFile.canRead()) { - idx = disableAutoIndexCreation ? loadFromDisk(inputFile, indexFile) // load without locking if we're in disableAutoIndexCreation mode - : attemptToLockAndLoadIndexFromDisk(inputFile, codec, indexFile, lock); - } - - // If we have an index, it means we either loaded it from disk without issue or we created an in-memory - // index due to not being able to acquire a lock. - if (idx != null) return idx; - - // We couldn't read the file, or we discovered a problem with the index file, so continue on to making a new index - idx = createIndexInMemory(inputFile, codec); - if ( ! disableAutoIndexCreation ) { - writeIndexToDisk(idx, indexFile, lock); - } - return idx; - } - - /** - * Attempt to acquire a shared lock and then load the index from disk. Returns an in-memory index if - * a lock could not be obtained. Returns null if a problem was discovered with the index file when it - * was examined (eg., it was out-of-date). - * - * @param inputFile the input file - * @param codec the codec to read from - * @param indexFile the index file itself - * @param lock the lock file - * @return an index, or null if we couldn't load one - * @throws IOException if we fail for FS issues - */ - protected Index attemptToLockAndLoadIndexFromDisk( final File inputFile, final FeatureCodec codec, final File indexFile, final FSLockWithShared lock ) throws IOException { - boolean locked = false; - Index idx = null; - - try { - locked = lock.sharedLock(); - - if ( ! locked ) { // can't lock file - logger.info(String.format("Could not acquire a shared lock on index file %s, falling back to using an in-memory index for this GATK run.", - indexFile.getAbsolutePath())); - idx = createIndexInMemory(inputFile, codec); - } - else { - idx = loadFromDisk(inputFile, indexFile); - } - } finally { - if (locked) lock.unlock(); - } - return idx; - } - - /** - * load the index from disk, checking for out of date indexes and old versions (both of which are deleted) - * @param inputFile the input file - * @param indexFile the input file, plus the index extension - * @return an Index, or null if we're unable to load - */ - protected Index loadFromDisk( final File inputFile, final File indexFile ) { - logger.debug("Loading Tribble index from disk for file " + inputFile); - Index index = IndexFactory.loadIndex(indexFile.getAbsolutePath()); - - // check if the file is up-to date (filestamp and version check) - if (index.isCurrentVersion() && indexFile.lastModified() >= inputFile.lastModified()) - return index; - else if (indexFile.lastModified() < inputFile.lastModified()) - logger.warn("Index file " + indexFile + " is out of date (index older than input file), " + - (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file")); - else // we've loaded an old version of the index, we want to remove it <-- currently not used, but may re-enable - logger.warn("Index file " + indexFile + " is out of date (old version), " + - (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file")); - - if ( ! disableAutoIndexCreation ) { - boolean deleted = indexFile.delete(); - if (!deleted) logger.warn("Index file " + indexFile + " is out of date, but could not be removed; it will not be trusted (we'll try to rebuild an in-memory copy)"); - } - - return null; - } - - - /** - * attempt to write the index to disk - * @param index the index to write to disk - * @param indexFile the index file location - * @param lock the locking object - * @throws IOException when unable to create the new index - */ - private void writeIndexToDisk( final Index index, final File indexFile, final FSLockWithShared lock ) throws IOException { - if ( disableAutoIndexCreation ) { - return; - } - - boolean locked = false; - - try { - locked = lock.exclusiveLock(); - - if (locked) { - logger.info("Writing Tribble index to disk for file " + indexFile); - LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); - index.write(stream); - stream.close(); - } - else // we can't write it to disk, just store it in memory, tell them this - logger.warn("Unable to write to " + indexFile + " for the index file, creating index in memory only"); - - try { logger.info(String.format(" Index for %s has size in bytes %d", indexFile, Sizeof.getObjectGraphSize(index))); } - catch ( ReviewedGATKException e) { } - } - finally { - if (locked) lock.unlock(); - } - - } - - /** - * create the index in memory, given the input file and feature codec - * @param inputFile the input file - * @param codec the codec - * @return a LinearIndex, given the file location - * @throws IOException when unable to create the index in memory - */ - protected Index createIndexInMemory(File inputFile, FeatureCodec codec) { - // this can take a while, let them know what we're doing - logger.debug("Creating Tribble index in memory for file " + inputFile); - Index idx = IndexFactory.createDynamicIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict); - return idx; - } - - /** - * set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible. - * (that each contig in the index is in the sequence dictionary). - * @param inputFile for proper error message formatting. - * @param dict the sequence dictionary - * @param index the index file - */ - public void validateAndUpdateIndexSequenceDictionary(final File inputFile, final Index index, final SAMSequenceDictionary dict) { - if (dict == null) throw new ReviewedGATKException("BUG: dict cannot be null"); - - // check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set - final SAMSequenceDictionary currentDict = IndexDictionaryUtils.createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary()); - validateTrackSequenceDictionary(inputFile.getAbsolutePath(), currentDict, dict); - - // actually update the dictionary in the index - IndexDictionaryUtils.setIndexSequenceDictionary(index, dict); - } - - public void validateTrackSequenceDictionary(final String trackName, - final SAMSequenceDictionary trackDict, - final SAMSequenceDictionary referenceDict ) { - IndexDictionaryUtils.validateTrackSequenceDictionary(trackName, trackDict, referenceDict, validationExclusionType); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/FeatureToGATKFeatureIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/FeatureToGATKFeatureIterator.java deleted file mode 100644 index 6fb073e12..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/FeatureToGATKFeatureIterator.java +++ /dev/null @@ -1,74 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.utils; - -import htsjdk.samtools.util.CloseableIterator; -import htsjdk.tribble.CloseableTribbleIterator; -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.utils.GenomeLocParser; - - -/** - * - * @author aaron - * - * Class FeatureToGATKFeatureIterator - * - * a wrapper on Tribble feature iterators so that they produce GATKFeatures (which produce GenomeLocs) - */ -public class FeatureToGATKFeatureIterator implements CloseableIterator { - private final GenomeLocParser genomeLocParser; - private final CloseableTribbleIterator iterator; - private final String name; - - public FeatureToGATKFeatureIterator(GenomeLocParser genomeLocParser,CloseableTribbleIterator iter, String name) { - this.genomeLocParser = genomeLocParser; - this.name = name; - this.iterator = iter; - } - - @Override - public boolean hasNext() { - return iterator.hasNext(); - } - - @Override - public GATKFeature next() { - return new GATKFeature.TribbleGATKFeature(genomeLocParser,iterator.next(),name); - } - - @Override - public void remove() { - throw new UnsupportedOperationException("Why does Iterator have this method? We always throw an exception here"); - } - - @Override - public void close() { - // The private adapted iterator may not be passed on by the method constructing this object, - // leaving only this adapter to close the wrapped iterator. - iterator.close(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/FlashBackIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/FlashBackIterator.java deleted file mode 100644 index 8fc549c00..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/FlashBackIterator.java +++ /dev/null @@ -1,221 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.utils; - -import htsjdk.samtools.SAMSequenceDictionary; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.HasGenomeLocation; - -import java.util.Comparator; -import java.util.LinkedList; - - -/** - * - * @author aaron - * - * Class FlashBackIterator - * - * better than acid washed jeans...more like a Delorean that flies through time - * - * This iterator buffers a certain amount of ROD data to 'flash back' to. This - * is needed for using ROD's in read traversals, because between shards we sometimes - * (actually often) need to go back to before the current iterators location and - * get RODs that overlap the current read. - */ -public class FlashBackIterator implements LocationAwareSeekableRODIterator { - private LocationAwareSeekableRODIterator iterator; - private LinkedList pastQueue = new LinkedList(); - private LinkedList aheadQueue = new LinkedList(); - private int MAX_QUEUE = 200; - - /** - * create a flashback iterator - * @param iterator given a LocationAwareSeekableRODIterator - */ - public FlashBackIterator(LocationAwareSeekableRODIterator iterator) { - this.iterator = iterator; - } - - /** - * Gets the header associated with the backing input stream. - * @return the ROD header. - */ - @Override - public Object getHeader() { - return iterator.getHeader(); - } - - /** - * Gets the sequence dictionary associated with the backing input stream. - * @return sequence dictionary from the ROD header. - */ - @Override - public SAMSequenceDictionary getSequenceDictionary() { - return iterator.getSequenceDictionary(); - } - - - /** - * peek at the next location - * @return - */ - @Override - public GenomeLoc peekNextLocation() { - return (aheadQueue.size() > 0) ? aheadQueue.getFirst().getLocation() : iterator.peekNextLocation(); - } - - /** - * get the position of this iterator - * @return - */ - @Override - public GenomeLoc position() { - return (aheadQueue.size() > 0) ? aheadQueue.getFirst().getLocation() : iterator.position(); - } - - /** - * seek forward on the iterator - * @param interval the interval to seek to - * @return a RODRecordList at that location, null otherwise - */ - @Override - public RODRecordList seekForward(GenomeLoc interval) { - - RODRecordList lt = iterator.seekForward(interval); - createPastRecord(lt); - return lt; - } - - /** - * do we have a next record - * @return true if we have another record - */ - @Override - public boolean hasNext() { - return (aheadQueue.size() > 0 || iterator.hasNext()); - } - - /** - * get the next record - * @return a RODRecordList - */ - @Override - public RODRecordList next() { - return getNext(); - } - - /** - * we don't support remove - */ - @Override - public void remove() { - throw new UnsupportedOperationException("We don't support remove"); - } - - /** - * get the next record, either from the queue or from the iterator - * @return a RODRecordList - */ - private RODRecordList getNext() { - if (aheadQueue.size() > 0) { - RODRecordList ret = aheadQueue.getFirst().getList(); - aheadQueue.removeFirst(); - return ret; - } else { - RODRecordList ret = iterator.next(); - createPastRecord(ret); - return ret; - } - } - - private void createPastRecord(RODRecordList ret) { - ComparableList rec = new ComparableList(ret); - if (rec.getLocation() != null) pastQueue.addLast(new ComparableList(ret)); - if (pastQueue.size() > this.MAX_QUEUE) pastQueue.removeFirst(); - } - - /** - * can we flash back to the specified location? - * - * @param location the location to try and flash back to - * - * @return true if we can, false otherwise - */ - public boolean canFlashBackTo(GenomeLoc location) { - GenomeLoc farthestBack = (pastQueue.size() > 0) ? pastQueue.getFirst().getLocation() : iterator.peekNextLocation(); - return (!farthestBack.isPast(location)); - } - - /** - * flashback! Throws an unsupported operation exception - * - * @param location where to flash back to - */ - public void flashBackTo(GenomeLoc location) { - if (!canFlashBackTo(location)) throw new UnsupportedOperationException("we can't flash back to " + location); - if (pastQueue.size()==0) return; // the iterator can do it alone - while (pastQueue.size() > 0 && !pastQueue.getLast().getLocation().isBefore(location)) { - aheadQueue.addFirst(pastQueue.getLast()); - pastQueue.removeLast(); - } - } - - public void close() { - this.aheadQueue.clear(); - this.pastQueue.clear(); - } -} - -/** - * a list that buffers the location for this rod - */ -class ComparableList implements Comparator, HasGenomeLocation { - private RODRecordList list; - private GenomeLoc location = null; - public ComparableList(RODRecordList list) { - this.list = list; - if (list != null && list.size() != 0) - location = list.getLocation(); - } - - @Override - public int compare(ComparableList list1, ComparableList list2) { - if (list1.location == null && list2.location == null) - return 0; - if (list1.location == null) return 1; - if (list2.location == null) return -1; - return (list1.location.compareTo(list2.location)); - } - - public GenomeLoc getLocation() { - return location; - } - - public RODRecordList getList() { - return list; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/GATKFeature.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/GATKFeature.java deleted file mode 100644 index 4d08f1bca..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/GATKFeature.java +++ /dev/null @@ -1,109 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.utils; - -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.engine.refdata.ReferenceOrderedDatum; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.HasGenomeLocation; - - -/** - * - * @author aaron - * - * Class GATKFeature - * - * This wraps a Tribble feature or a RODatum so that both present the same interface: a genome loc for position and a - * way of retrieving the track name. - */ -public abstract class GATKFeature implements Feature, HasGenomeLocation { - - public GATKFeature(String name) { - this.name = name; - } - - String name; - - protected void setName(String name) { - this.name = name; - } - - public String getName() { - return name; - } - - public abstract GenomeLoc getLocation(); - - // TODO: this should be a Feature - public abstract Object getUnderlyingObject(); - - /** - * wrapping a Tribble feature in a GATK friendly interface - */ - public static class TribbleGATKFeature extends GATKFeature { - private final GenomeLocParser genomeLocParser; - private final Feature feature; - private GenomeLoc position = null; - - public TribbleGATKFeature(GenomeLocParser genomeLocParser,Feature f, String name) { - super(name); - this.genomeLocParser = genomeLocParser; - feature = f; - } - public GenomeLoc getLocation() { - if (position == null) position = genomeLocParser.createGenomeLoc(feature.getChr(), feature.getStart(), feature.getEnd()); - return position; - } - - /** Return the features reference sequence name, e.g chromosome or contig */ - @Override - public String getChr() { - return feature.getChr(); - } - - /** Return the start position in 1-based coordinates (first base is 1) */ - @Override - public int getStart() { - return feature.getStart(); - } - - /** - * Return the end position following 1-based fully closed conventions. The length of a feature is - * end - start + 1; - */ - @Override - public int getEnd() { - return feature.getEnd(); - } - - // TODO: this should be a Feature, actually - public Object getUnderlyingObject() { - return feature; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/LocationAwareSeekableRODIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/LocationAwareSeekableRODIterator.java deleted file mode 100644 index 96c60b9d8..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/LocationAwareSeekableRODIterator.java +++ /dev/null @@ -1,49 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.utils; - -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.util.CloseableIterator; -import org.broadinstitute.gatk.utils.GenomeLoc; - -/** - * @author aaron - *

    - * Interface LocationAwareSeekableRODIterator - *

    - * combine iteration with a position aware interface - */ -public interface LocationAwareSeekableRODIterator extends CloseableIterator { - public Object getHeader(); - - public SAMSequenceDictionary getSequenceDictionary(); - - public GenomeLoc peekNextLocation(); - - public GenomeLoc position(); - - public RODRecordList seekForward(GenomeLoc interval); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/RMDTriplet.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/RMDTriplet.java deleted file mode 100644 index 9fa3d1e11..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/RMDTriplet.java +++ /dev/null @@ -1,92 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.utils; - - -import org.broadinstitute.gatk.utils.commandline.Tags; - -/** - * a helper class to manage our triplets of data for the -B command line option (name, type, file) - * TODO: The presence of four datapoints here suggests that this class' name isn't sufficient to describe its function. Rename. - */ -public class RMDTriplet { - public enum RMDStorageType { FILE, STREAM }; - - private final String name; - private final String type; - private final String file; - private final RMDStorageType storageType; - private final Tags tags; - - public RMDTriplet(final String name, final String type, final String file, final RMDStorageType storageType, final Tags tags) { - this.name = name; - this.type = type; - this.file = file; - this.storageType = storageType; - this.tags = tags; - } - - /** - * Gets the name of this track. RefMetaDataTrackers can use this identifier to retrieve data of a certain type. - * @return Name associated with this track. - */ - public String getName() { - return name; - } - - /** - * Gets the type of this track. Informs the GATK how to parse this file type. - * @return Type associated with this track. - */ - public String getType() { - return type; - } - - /** - * Gets the filename representing this track. Data is loaded from this file. - * @return Filename of the RMD. - */ - public String getFile() { - return file; - } - - /** - * The type of storage being used for this metadata track. Right now, can be either a - * file type (can be indexed) or a stream type (can't be indexed). - * @return Storage type for this RMD 'triplet'. - */ - public RMDStorageType getStorageType() { - return storageType; - } - - /** - * Gets the key=value tags associated with this track - * @return Tags associated with this track. - */ - public Tags getTags() { - return tags; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/RODRecordList.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/RODRecordList.java deleted file mode 100644 index b859edc10..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/RODRecordList.java +++ /dev/null @@ -1,45 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.utils; - -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.HasGenomeLocation; - -import java.util.List; - - -/** - * @author aaron - *

    - * Class RODRecordList - *

    - * make the RODRecord list an interface, so we can stub in other implementations - * during testing. - */ -public interface RODRecordList extends List, Comparable, HasGenomeLocation { - public GenomeLoc getLocation(); - public String getName(); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReport.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReport.java deleted file mode 100644 index 660ea95c1..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReport.java +++ /dev/null @@ -1,376 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.report; - -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.*; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; - -/** - * Container class for GATK report tables - */ -public class GATKReport { - public static final String GATKREPORT_HEADER_PREFIX = "#:GATKReport."; - public static final GATKReportVersion LATEST_REPORT_VERSION = GATKReportVersion.V1_1; - private static final String SEPARATOR = ":"; - private GATKReportVersion version = LATEST_REPORT_VERSION; - - private final TreeMap tables = new TreeMap(); - - /** - * Create a new, empty GATKReport. - */ - public GATKReport() { - } - - /** - * Create a new GATKReport with the contents of a GATKReport on disk. - * - * @param filename the path to the file to load - */ - public GATKReport(String filename) { - this(new File(filename)); - } - - /** - * Create a new GATKReport with the contents of a GATKReport on disk. - * - * @param file the file to load - */ - public GATKReport(File file) { - loadReport(file); - } - - /** - * Create a new GATK report from GATK report tables - * @param tables Any number of tables that you want to add to the report - */ - public GATKReport(GATKReportTable... tables) { - for( GATKReportTable table: tables) - addTable(table); - } - - /** - * Load a GATKReport file from disk - * - * @param file the file to load - */ - private void loadReport(File file) { - BufferedReader reader; - String reportHeader; - try { - reader = new BufferedReader(new FileReader(file)); - reportHeader = reader.readLine(); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotReadInputFile(file, "it does not exist"); - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(file, e); - } - - - // Read the first line for the version and number of tables. - version = GATKReportVersion.fromHeader(reportHeader); - if (version.equals(GATKReportVersion.V0_1) || - version.equals(GATKReportVersion.V0_2)) - throw new UserException("The GATK no longer supports reading legacy GATK Reports. Please use v1.0 or newer."); - - int nTables = Integer.parseInt(reportHeader.split(":")[2]); - - // Read each table according ot the number of tables - for (int i = 0; i < nTables; i++) { - addTable(new GATKReportTable(reader, version)); - } - } - - /** - * Add a new, empty table to the report - * - * @param tableName the name of the table - * @param tableDescription the description of the table - * @param numColumns the number of columns in this table - */ - public void addTable(final String tableName, final String tableDescription, final int numColumns) { - addTable(tableName, tableDescription, numColumns, GATKReportTable.TableSortingWay.DO_NOT_SORT); - } - - /** - * Add a new, empty table to the report - * - * @param tableName the name of the table - * @param tableDescription the description of the table - * @param numColumns the number of columns in this table - * @param sortingWay way to sort table - */ - public void addTable(final String tableName, final String tableDescription, final int numColumns, final GATKReportTable.TableSortingWay sortingWay) { - GATKReportTable table = new GATKReportTable(tableName, tableDescription, numColumns, sortingWay); - tables.put(tableName, table); - } - - /** - * Adds a table, empty or populated, to the report - * - * @param table the table to add - */ - public void addTable(GATKReportTable table) { - tables.put(table.getTableName(), table); - } - - public void addTables(List gatkReportTableV2s) { - for ( GATKReportTable table : gatkReportTableV2s ) - addTable(table); - } - - /** - * Return true if table with a given name exists - * - * @param tableName the name of the table - * @return true if the table exists, false otherwise - */ - public boolean hasTable(String tableName) { - return tables.containsKey(tableName); - } - - /** - * Return a table with a given name - * - * @param tableName the name of the table - * @return the table object - */ - public GATKReportTable getTable(String tableName) { - GATKReportTable table = tables.get(tableName); - if (table == null) - throw new ReviewedGATKException("Table is not in GATKReport: " + tableName); - return table; - } - - /** - * Print all tables contained within this container to a PrintStream - * - * @param out the PrintStream to which the tables should be written - */ - public void print(PrintStream out) { - out.println(GATKREPORT_HEADER_PREFIX + getVersion().toString() + SEPARATOR + getTables().size()); - for (GATKReportTable table : tables.values()) - table.write(out); - } - - public Collection getTables() { - return tables.values(); - } - - /** - * This is the main function is charge of gathering the reports. It checks that the reports are compatible and then - * calls the table gathering functions. - * - * @param input another GATKReport of the same format - */ - public void concat(GATKReport input) { - - if ( !isSameFormat(input) ) { - throw new ReviewedGATKException("Failed to combine GATKReport, format doesn't match!"); - } - - for ( Map.Entry table : tables.entrySet() ) { - table.getValue().concat(input.getTable(table.getKey())); - } - } - - public GATKReportVersion getVersion() { - return version; - } - - /** - * Returns whether or not the two reports have the same format, from columns, to tables, to reports, and everything - * in between. This does not check if the data inside is the same. This is the check to see if the two reports are - * gatherable or reduceable. - * - * @param report another GATK report - * @return true if the the reports are gatherable - */ - public boolean isSameFormat(GATKReport report) { - if (!version.equals(report.version)) { - return false; - } - if (!tables.keySet().equals(report.tables.keySet())) { - return false; - } - for (String tableName : tables.keySet()) { - if (!getTable(tableName).isSameFormat(report.getTable(tableName))) - return false; - } - return true; - } - - /** - * Checks that the reports are exactly the same. - * - * @param report another GATK report - * @return true if all field in the reports, tables, and columns are equal. - */ - public boolean equals(GATKReport report) { - if (!version.equals(report.version)) { - return false; - } - if (!tables.keySet().equals(report.tables.keySet())) { - return false; - } - for (String tableName : tables.keySet()) { - if (!getTable(tableName).equals(report.getTable(tableName))) - return false; - } - return true; - } - - /** - * The constructor for a simplified GATK Report. Simplified GATK report are designed for reports that do not need - * the advanced functionality of a full GATK Report. - *

    - * A simple GATK Report consists of: - *

    - * - A single table - * - No primary key ( it is hidden ) - *

    - * Optional: - * - Only untyped columns. As long as the data is an Object, it will be accepted. - * - Default column values being empty strings. - *

    - * Limitations: - *

    - * - A simple GATK report cannot contain multiple tables. - * - It cannot contain typed columns, which prevents arithmetic gathering. - * - * @param tableName The name of your simple GATK report table - * @param columns The names of the columns in your table - * @return a simplified GATK report - */ - public static GATKReport newSimpleReport(final String tableName, final String... columns) { - return newSimpleReportWithDescription(tableName, "A simplified GATK table report", columns); - } - - /** - * @see #newSimpleReport(String, String...) but with a customized description - * @param tableName - * @param desc - * @param columns - * @return - */ - public static GATKReport newSimpleReportWithDescription(final String tableName, final String desc, final String... columns) { - GATKReportTable table = new GATKReportTable(tableName, desc, columns.length); - - for (String column : columns) { - table.addColumn(column, ""); - } - - GATKReport output = new GATKReport(); - output.addTable(table); - - return output; - } - - /** - * The constructor for a simplified GATK Report. Simplified GATK report are designed for reports that do not need - * the advanced functionality of a full GATK Report. - *

    - * A simple GATK Report consists of: - *

    - * - A single table - * - No primary key ( it is hidden ) - *

    - * Optional: - * - Only untyped columns. As long as the data is an Object, it will be accepted. - * - Default column values being empty strings. - *

    - * Limitations: - *

    - * - A simple GATK report cannot contain multiple tables. - * - It cannot contain typed columns, which prevents arithmetic gathering. - * - * @param tableName The name of your simple GATK report table - * @param columns The names of the columns in your table - * @return a simplified GATK report - */ - public static GATKReport newSimpleReport(final String tableName, final List columns) { - GATKReportTable table = new GATKReportTable(tableName, "A simplified GATK table report", columns.size()); - - for (String column : columns) { - table.addColumn(column, ""); - } - - GATKReport output = new GATKReport(); - output.addTable(table); - - return output; - } - - /** - * This method provides an efficient way to populate a simplified GATK report. This method will only work on reports - * that qualify as simplified GATK reports. See the newSimpleReport() constructor for more information. - * - * @param values the row of data to be added to the table. - * Note: the number of arguments must match the columns in the table. - */ - public void addRow(final Object... values) { - // Must be a simple report - if ( tables.size() != 1 ) - throw new ReviewedGATKException("Cannot write a row to a complex GATK Report"); - - GATKReportTable table = tables.firstEntry().getValue(); - if ( table.getNumColumns() != values.length ) - throw new ReviewedGATKException("The number of arguments in writeRow (" + values.length + ") must match the number of columns in the table (" + table.getNumColumns() + ")" ); - - final int rowIndex = table.getNumRows(); - for ( int i = 0; i < values.length; i++ ) - table.set(rowIndex, i, values[i]); - } - - /** - * This method provides an efficient way to populate a simplified GATK report. This method will only work on reports - * that qualify as simplified GATK reports. See the newSimpleReport() constructor for more information. - * - * @param values the row of data to be added to the table. - * Note: the number of arguments must match the columns in the table. - */ - public void addRowList(final List values) { - if ( tables.size() != 1 ) - throw new ReviewedGATKException("Cannot write a row to a complex GATK Report"); - - GATKReportTable table = tables.firstEntry().getValue(); - if ( table.getNumColumns() != values.size() ) - throw new ReviewedGATKException("The number of arguments in writeRow() must match the number of columns in the table"); - - final int rowIndex = table.getNumRows(); - int idx = 0; - for ( Object value : values ) { - table.set(rowIndex,idx,value); - idx++; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportColumn.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportColumn.java deleted file mode 100644 index ffdefff36..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportColumn.java +++ /dev/null @@ -1,147 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.report; - -import org.apache.commons.lang.math.NumberUtils; - -import java.util.Arrays; -import java.util.Collection; - -/** - * column information within a GATK report table - */ -public class GATKReportColumn { - final private String columnName; - final private String format; - final private GATKReportDataType dataType; - - private GATKReportColumnFormat columnFormat; - private GATKReportColumnFormat.Alignment alignment = GATKReportColumnFormat.Alignment.RIGHT; // default alignment is to the right unless values added ask for a left alignment - private int maxWidth = 0; - - /** - * Construct the column object, specifying the column name, default value, whether or not the column should be - * displayed, and the format string. This cannot be null. - * - * @param columnName the name of the column - * @param format format string - */ - public GATKReportColumn(final String columnName, final String format) { - this.columnName = columnName; - this.maxWidth = columnName.length(); - if ( format.equals("") ) { - this.format = "%s"; - this.dataType = GATKReportDataType.Unknown; - } - else { - this.format = format; - this.dataType = GATKReportDataType.fromFormatString(format); - } - } - - /** - * Get the display width for this column. This allows the entire column to be displayed with the appropriate, fixed - * width. - * - * @return the format string for this column - */ - public GATKReportColumnFormat getColumnFormat() { - if (columnFormat != null) - return columnFormat; - - columnFormat = new GATKReportColumnFormat(maxWidth, alignment); - return columnFormat; - } - - private static final Collection RIGHT_ALIGN_STRINGS = Arrays.asList( - "null", - "NA", - String.valueOf(Double.POSITIVE_INFINITY), - String.valueOf(Double.NEGATIVE_INFINITY), - String.valueOf(Double.NaN)); - - /** - * Check if the value can be right aligned. Does not trim the values before checking if numeric since it assumes - * the spaces mean that the value is already padded. - * - * @param value to check - * @return true if the value is a right alignable - */ - protected static boolean isRightAlign(final String value) { - return value == null || RIGHT_ALIGN_STRINGS.contains(value) || NumberUtils.isNumber(value.trim()); - } - - /** - * Returns a string version of the values. - * - * @param obj The object to convert to a string - * @return The string representation of the column - */ - private String formatValue(final Object obj) { - String value; - if (obj == null) { - value = "null"; - } - else if ( dataType.equals(GATKReportDataType.Unknown) && (obj instanceof Double || obj instanceof Float) ) { - value = String.format("%.8f", obj); - } - else - value = String.format(format, obj); - - return value; - } - - public GATKReportDataType getDataType() { - return dataType; - } - - public String getColumnName() { - return columnName; - } - - public String getFormat() { - return dataType.equals(GATKReportDataType.Unknown) ? "%s" : format; - } - - public void updateFormatting(final Object value) { - if (value != null) { - final String formatted = formatValue(value); - if ( formatted.length() > 0 ) { - updateMaxWidth(formatted); - updateFormat(formatted); - } - } - } - - private void updateMaxWidth(final String formatted) { - maxWidth = Math.max(formatted.length(), maxWidth); - } - - private void updateFormat(final String formatted) { - if (alignment == GATKReportColumnFormat.Alignment.RIGHT) - alignment = isRightAlign(formatted) ? GATKReportColumnFormat.Alignment.RIGHT : GATKReportColumnFormat.Alignment.LEFT; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportColumnFormat.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportColumnFormat.java deleted file mode 100644 index 664b503b0..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportColumnFormat.java +++ /dev/null @@ -1,63 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.report; - -/** - * Column width and left/right alignment. - */ -public class GATKReportColumnFormat { - public static enum Alignment { LEFT, RIGHT } - private final int width; - private final Alignment alignment; - - public GATKReportColumnFormat(int width, Alignment alignment) { - this.width = width; - this.alignment = alignment; - } - - public int getWidth() { - return width; - } - - public Alignment getAlignment() { - return alignment; - } - - public String getNameFormat() { - return "%-" + width + "s"; - } - - public String getValueFormat() { - switch (alignment) { - case LEFT: - return "%-" + width + "s"; - case RIGHT: - return "%" + width + "s"; - default: - throw new UnsupportedOperationException("Unknown alignment: " + alignment); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportDataType.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportDataType.java deleted file mode 100644 index acfa74f25..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportDataType.java +++ /dev/null @@ -1,236 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.report; - -import java.util.EnumSet; -import java.util.HashMap; -import java.util.Map; - -/** - * The gatherable data types acceptable in a GATK report column. - */ -public enum GATKReportDataType { - /** - * The null type should not be used. - */ - Null("Null"), - - /** - * The default value when a format string is not present - */ - Unknown("Unknown"), - - /** - * Used for boolean values. Will display as true or false in the table. - */ - Boolean("%[Bb]"), - - /** - * Used for char values. Will display as a char so use printable values! - */ - Character("%[Cc]"), - - /** - * Used for float and double values. Will output a decimal with format %.8f unless otherwise specified. - */ - Decimal("%.*[EeFf]"), - - /** - * Used for int, byte, short, and long values. Will display the full number by default. - */ - Integer("%[Dd]"), - - /** - * Used for string values. Displays the string itself. - */ - String("%[Ss]"); - - private final String dataTypeString; - - private GATKReportDataType(String dataTypeString) { - this.dataTypeString = dataTypeString; - } - - private static final Map lookup = new HashMap(); - - static { - for (GATKReportDataType s : EnumSet.allOf(GATKReportDataType.class)) - lookup.put(s.dataTypeString, s); - } - - - @Override - public String toString() { - return this.dataTypeString; - } - - /** - * Returns a GATK report data type from the Object specified. It looks through the list of acceptable classes and - * returns the appropriate data type. - * - * @param object the object ot derive the data type from - * @return the appropriate data type - */ - public static GATKReportDataType fromObject(Object object) { - GATKReportDataType value; - if (object instanceof Boolean) { - value = GATKReportDataType.Boolean; - - } else if (object instanceof Character) { - value = GATKReportDataType.Character; - - } else if (object instanceof Float || - object instanceof Double) { - value = GATKReportDataType.Decimal; - - } else if (object instanceof Integer || - object instanceof Long || - object instanceof Short || - object instanceof Byte ) { - value = GATKReportDataType.Integer; - - } else if (object instanceof String) { - value = GATKReportDataType.String; - - } else { - value = GATKReportDataType.Unknown; - //throw new UserException("GATKReport could not convert the data object into a GATKReportDataType. Acceptable data objects are found in the documentation."); - } - return value; - } - - /** - * Returns a GATK report data type from the format string specified. It uses regex matching from the enumerated - * Strings. - * - * @param format the format string to derive the data type from - * @return the appropriate data type - */ - public static GATKReportDataType fromFormatString(String format) { - if (format.equals("")) - return Unknown; - for (GATKReportDataType type : lookup.values()) { - if (format.matches(type.toString()) ) - return type; - } - return Unknown; - } - - /** - * Returns the default value of the data type. It returns an object that matches the class of the data type. - * - * @return an object that matches the data type - */ - public Object getDefaultValue() { - switch (this) { - case Decimal: - return 0.0D; - case Boolean: - return false; - case Character: - return '0'; - case Integer: - return 0L; - case String: - return ""; - default: - return null; - } - } - - /** - * Checks if the two objects are equal using the appropriate test form the data types. - * - * @param a an object - * @param b another object to check if equal - * @return true - the objects are equal, false - the objects are nto equal - */ - public boolean isEqual(Object a, Object b) { - switch (this) { - case Null: - return true; - case Decimal: - case Boolean: - case Integer: - return a.toString().equals(b.toString()); - case Character: - case String: - default: - return a.equals(b); - } - } - - /** - * Converts an input String to the appropriate type using the data type. Used for parsing loading a GATK report from - * file. - * - * @param obj The input string - * @return an object that matches the data type. - */ - Object Parse(Object obj) { - if (obj instanceof String) { - String str = obj.toString(); - switch (this) { - case Decimal: - return Double.parseDouble(str); - case Boolean: - return java.lang.Boolean.parseBoolean(str); - case Integer: - return Long.parseLong(str); - case String: - return str; - case Character: - return str.toCharArray()[0]; - default: - return str; - } - } else - return null; - } - - /** - * Returns a format string version of the value according to the data type. - * - * @return The printf string representation of the object according to data type. - */ - public String getDefaultFormatString() { - switch (this) { - case Decimal: - return "%.8f"; - case Boolean: - return "%b"; - case Integer: - return "%d"; - case String: - return "%s"; - case Character: - return "%c"; - case Null: - default: - return "%s"; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportGatherer.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportGatherer.java deleted file mode 100644 index 5f7f7670c..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportGatherer.java +++ /dev/null @@ -1,62 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.report; - -import org.broadinstitute.gatk.utils.commandline.Gatherer; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; -import java.util.List; - -public class GATKReportGatherer extends Gatherer { - @Override - public void gather(List inputs, File output) { - //Combines inputs GATKReport to one output - - PrintStream o; - try { - o = new PrintStream(output); - } catch (FileNotFoundException e) { - throw new UserException(String.format("File %s to be output by GATKReportGatherer function was not found", output)); - } - - GATKReport current = new GATKReport(); - boolean isFirst = true; - for (File input : inputs) { - if (isFirst) { - current = new GATKReport(input); - isFirst = false; - } else { - current.concat(new GATKReport(input)); - } - } - - current.print(o); - o.close(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportTable.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportTable.java deleted file mode 100644 index 6a1e456d4..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportTable.java +++ /dev/null @@ -1,779 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.report; - -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.text.TextFormattingUtils; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.PrintStream; -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -public class GATKReportTable { - /** - * REGEX that matches any table with an invalid name - */ - public static final String INVALID_TABLE_NAME_REGEX = "[^a-zA-Z0-9_\\-\\.]"; - private static final String GATKTABLE_HEADER_PREFIX = "#:GATKTable"; - private static final String SEPARATOR = ":"; - private static final String ENDLINE = ":;"; - - private final String tableName; - private final String tableDescription; - - private final TableSortingWay sortingWay; - - private List underlyingData; - private final List columnInfo; - private final Map columnNameToIndex; - private final HashMap rowIdToIndex; - - private static final String COULD_NOT_READ_HEADER = "Could not read the header of this file -- "; - private static final String COULD_NOT_READ_COLUMN_NAMES = "Could not read the column names of this file -- "; - private static final String COULD_NOT_READ_DATA_LINE = "Could not read a data line of this table -- "; - private static final String COULD_NOT_READ_EMPTY_LINE = "Could not read the last empty line of this table -- "; - private static final String OLD_GATK_TABLE_VERSION = "We no longer support older versions of the GATK Tables"; - - private static final int INITITAL_ARRAY_SIZE = 10000; - private static final String NUMBER_CONVERSION_EXCEPTION = "String is a number but is not a long or a double: "; - - protected enum TableDataHeaderFields { - COLS(2), - ROWS(3), - FORMAT_START(4); - - private final int index; - TableDataHeaderFields(int index) { this.index = index; } - public int index() { return index; } - } - - public enum TableSortingWay { - SORT_BY_ROW, - SORT_BY_COLUMN, - DO_NOT_SORT - } - - protected enum TableNameHeaderFields { - NAME(2), - DESCRIPTION(3); - - private final int index; - TableNameHeaderFields(int index) { this.index = index; } - public int index() { return index; } - } - - /** - * Construct a new GATK report table from the reader - * Note that the row ID mappings are just the index -> index - * - * @param reader the reader - * @param version the GATK report version - */ - public GATKReportTable(BufferedReader reader, GATKReportVersion version) { - - switch ( version ) { - case V1_1: - // read in the header lines - final String[] tableData, tableNameData; - try { - tableData = reader.readLine().split(SEPARATOR); - tableNameData = reader.readLine().split(SEPARATOR); - } catch (IOException e) { - throw new ReviewedGATKException(COULD_NOT_READ_HEADER + e.getMessage()); - } - - // parse the header fields - tableName = tableNameData[TableNameHeaderFields.NAME.index()]; - tableDescription = (tableNameData.length <= TableNameHeaderFields.DESCRIPTION.index()) ? "" : tableNameData[TableNameHeaderFields.DESCRIPTION.index()]; // table may have no description! (and that's okay) - - // when reading from a file, we do not re-sort the rows - sortingWay = TableSortingWay.DO_NOT_SORT; - - // initialize the data - final int nColumns = Integer.parseInt(tableData[TableDataHeaderFields.COLS.index()]); - final int nRows = Integer.parseInt(tableData[TableDataHeaderFields.ROWS.index()]); - underlyingData = new ArrayList(nRows); - columnInfo = new ArrayList(nColumns); - columnNameToIndex = new HashMap(nColumns); - - // when reading from a file, the row ID mapping is just the index - rowIdToIndex = new HashMap(); - for ( int i = 0; i < nRows; i++ ) - rowIdToIndex.put(i, i); - - // read the column names - final String columnLine; - try { - columnLine = reader.readLine(); - } catch (IOException e) { - throw new ReviewedGATKException(COULD_NOT_READ_COLUMN_NAMES); - } - - final List columnStarts = TextFormattingUtils.getWordStarts(columnLine); - final String[] columnNames = TextFormattingUtils.splitFixedWidth(columnLine, columnStarts); - - // Put in columns using the format string from the header - for ( int i = 0; i < nColumns; i++ ) { - final String format = tableData[TableDataHeaderFields.FORMAT_START.index() + i]; - addColumn(columnNames[i], format); - } - - // fill in the table - try { - for ( int i = 0; i < nRows; i++ ) { - // read a data line - final String dataLine = reader.readLine(); - final List lineSplits = Arrays.asList(TextFormattingUtils.splitFixedWidth(dataLine, columnStarts)); - - underlyingData.add(new Object[nColumns]); - for ( int columnIndex = 0; columnIndex < nColumns; columnIndex++ ) { - - final GATKReportDataType type = columnInfo.get(columnIndex).getDataType(); - final String columnName = columnNames[columnIndex]; - set(i, columnName, type.Parse(lineSplits.get(columnIndex))); - - } - } - } catch (IOException e) { - throw new ReviewedGATKException(COULD_NOT_READ_DATA_LINE + e.getMessage()); - } - - try { - reader.readLine(); - } catch (IOException e) { - throw new ReviewedGATKException(COULD_NOT_READ_EMPTY_LINE + e.getMessage()); - } - break; - - default: - throw new ReviewedGATKException(OLD_GATK_TABLE_VERSION); - } - } - - /** - * Construct a new GATK report table with the specified name and description - * - * @param tableName the name of the table - * @param tableDescription the description of the table - * @param numColumns the number of columns in this table - */ - public GATKReportTable(final String tableName, final String tableDescription, final int numColumns) { - this(tableName, tableDescription, numColumns, TableSortingWay.SORT_BY_ROW); - } - - /** - * Construct a new GATK report table with the specified name and description and whether to sort rows by the row ID. - * - * @param tableName the name of the table - * @param tableDescription the description of the table - * @param numColumns the number of columns in this table - * @param sortingWay in what way to sort rows (instead of the order in which they were added) - */ - public GATKReportTable(final String tableName, final String tableDescription, final int numColumns, final TableSortingWay sortingWay) { - if ( !isValidName(tableName) ) { - throw new ReviewedGATKException("Attempted to set a GATKReportTable name of '" + tableName + "'. GATKReportTable names must be purely alphanumeric - no spaces or special characters are allowed."); - } - - if ( !isValidDescription(tableDescription) ) { - throw new ReviewedGATKException("Attempted to set a GATKReportTable description of '" + tableDescription + "'. GATKReportTable descriptions must not contain newlines."); - } - - this.tableName = tableName; - this.tableDescription = tableDescription; - this.sortingWay = sortingWay; - - underlyingData = new ArrayList(INITITAL_ARRAY_SIZE); - columnInfo = new ArrayList(numColumns); - columnNameToIndex = new HashMap(numColumns); - rowIdToIndex = new HashMap(); - } - - /** - * Create a new GATKReportTable with the same structure - * @param tableToCopy - */ - public GATKReportTable(final GATKReportTable tableToCopy, final boolean copyData) { - this(tableToCopy.getTableName(), tableToCopy.getTableDescription(), tableToCopy.getNumColumns(), tableToCopy.sortingWay); - for ( final GATKReportColumn column : tableToCopy.getColumnInfo() ) - addColumn(column.getColumnName(), column.getFormat()); - if ( copyData ) - throw new IllegalArgumentException("sorry, copying data in GATKReportTable isn't supported"); - } - - /** - * Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed - * - * @param name the name of the table or column - * @return true if the name is valid, false if otherwise - */ - private boolean isValidName(String name) { - Pattern p = Pattern.compile(INVALID_TABLE_NAME_REGEX); - Matcher m = p.matcher(name); - - return !m.find(); - } - - /** - * Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed - * - * @param description the name of the table or column - * @return true if the name is valid, false if otherwise - */ - private boolean isValidDescription(String description) { - Pattern p = Pattern.compile("\\r|\\n"); - Matcher m = p.matcher(description); - - return !m.find(); - } - - /** - * Add a mapping from ID to the index of a new row added to the table. - * - * @param ID the unique ID - */ - public void addRowID(final String ID) { - addRowID(ID, false); - } - - /** - * Add a mapping from ID to the index of a new row added to the table. - * - * @param ID the unique ID - * @param populateFirstColumn should we automatically populate the first column with the row's ID? - */ - public void addRowID(final String ID, final boolean populateFirstColumn) { - addRowIDMapping(ID, underlyingData.size(), populateFirstColumn); - } - - /** - * Add a mapping from ID to row index. - * - * @param ID the unique ID - * @param index the index associated with the ID - */ - public void addRowIDMapping(final String ID, final int index) { - addRowIDMapping(ID, index, false); - } - - /** - * Add a mapping from ID to row index. - * - * @param ID the unique ID - * @param index the index associated with the ID - * @param populateFirstColumn should we automatically populate the first column with the row's ID? - */ - public void addRowIDMapping(final Object ID, final int index, final boolean populateFirstColumn) { - expandTo(index, false); - rowIdToIndex.put(ID, index); - - if ( populateFirstColumn ) - set(index, 0, ID); - } - - /** - * Remove a mapping from ID to row index. - * - * @param ID the row ID - */ - public void removeRowIDMapping(final Object ID) { - rowIdToIndex.remove(ID); - } - - /** - * Add a column to the report - * - * @param columnName the name of the column - */ - public void addColumn(String columnName) { - addColumn(columnName, ""); - } - - /** - * Add a column to the report and the format string used to display the data. - * - * @param columnName the name of the column - * @param format the format string used to display data - */ - public void addColumn(String columnName, String format) { - columnNameToIndex.put(columnName, columnInfo.size()); - columnInfo.add(new GATKReportColumn(columnName, format)); - } - - /** - * Check if the requested cell is valid and expand the table if necessary - * - * @param rowIndex the row index - * @param colIndex the column index - */ - private void verifyEntry(final int rowIndex, final int colIndex) { - if ( rowIndex < 0 || colIndex < 0 || colIndex >= getNumColumns() ) - throw new ReviewedGATKException("attempted to access a cell that does not exist in table '" + tableName + "'"); - } - - /** - * expand the underlying table if needed to include the given row index - * - * @param rowIndex the row index - * @param updateRowIdMap should we update the row ID map? - */ - private void expandTo(final int rowIndex, final boolean updateRowIdMap) { - int currentSize = underlyingData.size(); - if ( rowIndex >= currentSize ) { - final int numNewRows = rowIndex - currentSize + 1; - for ( int i = 0; i < numNewRows; i++ ) { - if ( updateRowIdMap ) - rowIdToIndex.put(currentSize, currentSize); - underlyingData.add(new Object[getNumColumns()]); - currentSize++; - } - } - } - - /** - * Set the value for a given position in the table. - * If the row ID doesn't exist, it will create a new row in the table with the given ID. - * - * @param rowID the row ID - * @param columnName the name of the column - * @param value the value to set - */ - public void set(final Object rowID, final String columnName, final Object value) { - if ( !rowIdToIndex.containsKey(rowID) ) { - rowIdToIndex.put(rowID, underlyingData.size()); - expandTo(underlyingData.size(), false); - } - set(rowIdToIndex.get(rowID), columnNameToIndex.get(columnName), value); - } - - /** - * Set the value for a given position in the table. - * If the row index doesn't exist, it will create new rows in the table accordingly. - * - * @param rowIndex the row index - * @param colIndex the column index - * @param value the value to set - */ - public void set(final int rowIndex, final int colIndex, Object value) { - expandTo(rowIndex, true); - verifyEntry(rowIndex, colIndex); - GATKReportColumn column = columnInfo.get(colIndex); - - // We do not accept internal null values - if (value == null) - value = "null"; - else - value = fixType(value, column); - - if ( column.getDataType().equals(GATKReportDataType.fromObject(value)) || column.getDataType().equals(GATKReportDataType.Unknown) ) { - underlyingData.get(rowIndex)[colIndex] = value; - column.updateFormatting(value); - } else { - throw new ReviewedGATKException(String.format("Tried to add an object of type: %s to a column of type: %s", GATKReportDataType.fromObject(value).name(), column.getDataType().name())); - } - } - - /** - * Returns true if the table contains a row mapping with the given ID - * - * @param rowID the row ID - */ - public boolean containsRowID(final Object rowID) { - return rowIdToIndex.containsKey(rowID); - } - - /** - * Returns the row mapping IDs - * - */ - public Collection getRowIDs() { - return rowIdToIndex.keySet(); - } - - /** - * Increment the value for a given position in the table. - * Throws an exception if the value in the cell is not an integer. - * - * @param rowID the row ID - * @param columnName the name of the column - */ - public void increment(final Object rowID, final String columnName) { - int prevValue; - if ( !rowIdToIndex.containsKey(rowID) ) { - rowIdToIndex.put(rowID, underlyingData.size()); - underlyingData.add(new Object[getNumColumns()]); - prevValue = 0; - } else { - Object obj = get(rowID, columnName); - if ( !(obj instanceof Integer) ) - throw new ReviewedGATKException("Attempting to increment a value in a cell that is not an integer"); - prevValue = (Integer)obj; - } - - set(rowIdToIndex.get(rowID), columnNameToIndex.get(columnName), prevValue + 1); - } - - /** - * Returns the index of the first row matching the column values. - * Ex: "CountVariants", "dbsnp", "eval", "called", "all", "novel", "all" - * - * @param columnValues column values. - * @return The index of the first row matching the column values or -1 if no such row exists. - */ - public int findRowByData(final Object... columnValues) { - if ( columnValues == null || columnValues.length == 0 || columnValues.length > getNumColumns() ) - return -1; - - for ( int rowIndex = 0; rowIndex < underlyingData.size(); rowIndex++ ) { - - final Object[] row = underlyingData.get(rowIndex); - - boolean matches = true; - for ( int colIndex = 0; colIndex < columnValues.length; colIndex++ ) { - if ( !columnValues[colIndex].equals(row[colIndex]) ) { - matches = false; - break; - } - } - - if ( matches ) - return rowIndex; - } - - return -1; - } - - private Object fixType(final Object value, final GATKReportColumn column) { - // Below is some code to convert a string into its appropriate type. - - // todo -- Types have to be more flexible. For example, %d should accept Integers, Shorts and Bytes. - - Object newValue = null; - if ( value instanceof String && !column.getDataType().equals(GATKReportDataType.String) ) { - // Integer case - if ( column.getDataType().equals(GATKReportDataType.Integer) ) { - try { - newValue = Long.parseLong((String) value); - } catch (Exception e) { - /** do nothing */ - } - } - if ( column.getDataType().equals(GATKReportDataType.Decimal) ) { - try { - newValue = Double.parseDouble((String) value); - } catch (Exception e) { - /** do nothing */ - } - } - if ( column.getDataType().equals(GATKReportDataType.Character) && ((String) value).length() == 1 ) { - newValue = ((String) value).charAt(0); - } - } - - return (newValue != null) ? newValue : value; - } - - /** - * Get a value from the given position in the table - * - * @param rowID the row ID - * @param columnName the name of the column - * @return the value stored at the specified position in the table - */ - public Object get(final Object rowID, final String columnName) { - return get(rowIdToIndex.get(rowID), columnNameToIndex.get(columnName)); - } - - /** - * Get a value from the given position in the table - * - * @param rowIndex the row ID - * @param columnName the name of the column - * @return the value stored at the specified position in the table - */ - public Object get(final int rowIndex, final String columnName) { - return get(rowIndex, columnNameToIndex.get(columnName)); - } - - /** - * Get a value from the given position in the table - * - * @param rowIndex the index of the row - * @param columnIndex the index of the column - * @return the value stored at the specified position in the table - */ - public Object get(int rowIndex, int columnIndex) { - verifyEntry(rowIndex, columnIndex); - return underlyingData.get(rowIndex)[columnIndex]; - } - - /** - * Write the table to the PrintStream, formatted nicely to be human-readable, AWK-able, and R-friendly. - * - * @param out the PrintStream to which the table should be written - */ - void write(final PrintStream out) { - - /* - * Table header: - * #:GATKTable:nColumns:nRows:(DataType for each column):; - * #:GATKTable:TableName:Description :; - * key colA colB - * row1 xxxx xxxxx - */ - - // write the table definition - out.printf(GATKTABLE_HEADER_PREFIX + ":%d:%d", getNumColumns(), getNumRows()); - - // write the formats for all the columns - for ( final GATKReportColumn column : columnInfo ) - out.print(SEPARATOR + column.getFormat()); - out.println(ENDLINE); - - // write the table name & description - out.printf(GATKTABLE_HEADER_PREFIX + ":%s:%s\n", tableName, tableDescription); - - // write the column names - boolean needsPadding = false; - for ( final GATKReportColumn column : columnInfo ) { - if ( needsPadding ) - out.printf(" "); - needsPadding = true; - - out.printf(column.getColumnFormat().getNameFormat(), column.getColumnName()); - } - out.println(); - - // write the table body - switch (sortingWay) { - case SORT_BY_COLUMN: - Collections.sort(underlyingData, new Comparator() { - //INVARIANT the two arrays are of the same length and corresponding elements are of the same type - @Override - public int compare(Object[] objectArr1, Object[] objectArr2) { - final int EQUAL = 0; - - int result = EQUAL; - - int l = objectArr1.length; - for (int x = 0; x < l; x++) { - if (objectArr1[x] instanceof Integer) { - result = ((Integer)objectArr1[x]).compareTo((Integer)objectArr2[x]); - } else if (objectArr1[x] instanceof Double) { - result = ((Double)objectArr1[x]).compareTo((Double)objectArr2[x]); - } else { // default uses String comparison - result = objectArr1[x].toString().compareTo(objectArr2[x].toString()); - } - if( result != EQUAL) { - return result; - } - } - return result; - } - }); - for ( final Object[] row : underlyingData ) - writeRow(out, row); - break; - case SORT_BY_ROW: - // make sure that there are exactly the correct number of ID mappings - if ( rowIdToIndex.size() != underlyingData.size() ) - throw new ReviewedGATKException("There isn't a 1-to-1 mapping from row ID to index; this can happen when rows are not created consistently"); - - final TreeMap sortedMap; - try { - sortedMap = new TreeMap(rowIdToIndex); - } catch (ClassCastException e) { - throw new ReviewedGATKException("Unable to sort the rows based on the row IDs because the ID Objects are of different types"); - } - for ( final Map.Entry rowKey : sortedMap.entrySet() ) - writeRow(out, underlyingData.get(rowKey.getValue())); - break; - case DO_NOT_SORT: - for ( final Object[] row : underlyingData ) - writeRow(out, row); - } - out.println(); - } - - private void writeRow(final PrintStream out, final Object[] row) { - boolean needsPadding = false; - for ( int i = 0; i < row.length; i++ ) { - if ( needsPadding ) - out.printf(" "); - needsPadding = true; - - final Object obj = row[i]; - final String value; - - final GATKReportColumn info = columnInfo.get(i); - - if ( obj == null ) - value = "null"; - else if ( info.getDataType().equals(GATKReportDataType.Unknown) && (obj instanceof Double || obj instanceof Float) ) - value = String.format("%.8f", obj); - else - value = String.format(info.getFormat(), obj); - - out.printf(info.getColumnFormat().getValueFormat(), value); - } - - out.println(); - } - - public int getNumRows() { - return underlyingData.size(); - } - - public int getNumColumns() { - return columnInfo.size(); - } - - public List getColumnInfo() { - return columnInfo; - } - - public String getTableName() { - return tableName; - } - - public String getTableDescription() { - return tableDescription; - } - - /** - * Concatenates the rows from the table to this one - * - * @param table another GATK table - */ - public void concat(final GATKReportTable table) { - if ( !isSameFormat(table) ) - throw new ReviewedGATKException("Error trying to concatenate tables with different formats"); - - // add the data - underlyingData.addAll(table.underlyingData); - - // update the row index map - final int currentNumRows = getNumRows(); - for ( Map.Entry entry : table.rowIdToIndex.entrySet() ) - rowIdToIndex.put(entry.getKey(), entry.getValue() + currentNumRows); - } - - /** - * Returns whether or not the two tables have the same format including columns and everything in between. This does - * not check if the data inside is the same. This is the check to see if the two tables are gatherable or - * reduceable - * - * @param table another GATK table - * @return true if the the tables are gatherable - */ - public boolean isSameFormat(final GATKReportTable table) { - if ( !tableName.equals(table.tableName) || - !tableDescription.equals(table.tableDescription) || - columnInfo.size() != table.columnInfo.size() ) - return false; - - for ( int i = 0; i < columnInfo.size(); i++ ) { - if ( !columnInfo.get(i).getFormat().equals(table.columnInfo.get(i).getFormat()) || - !columnInfo.get(i).getColumnName().equals(table.columnInfo.get(i).getColumnName()) ) - return false; - } - - return true; - } - - /** - * Checks that the tables are exactly the same. - * - * @param table another GATK report - * @return true if all field in the reports, tables, and columns are equal. - */ - public boolean equals(final GATKReportTable table) { - if ( !isSameFormat(table) || - underlyingData.size() != table.underlyingData.size() ) - return false; - - final List myOrderedRows = getOrderedRows(); - final List otherOrderedRows = table.getOrderedRows(); - - for ( int i = 0; i < underlyingData.size(); i++ ) { - final Object[] myData = myOrderedRows.get(i); - final Object[] otherData = otherOrderedRows.get(i); - for ( int j = 0; j < myData.length; j++ ) { - if ( !myData[j].toString().equals(otherData[j].toString()) ) // need to deal with different typing (e.g. Long vs. Integer) - return false; - } - } - - return true; - } - - private List getOrderedRows() { - - switch (sortingWay) { - case SORT_BY_COLUMN: - Collections.sort(underlyingData, new Comparator() { - //INVARIANT the two arrays are of the same length and corresponding elements are of the same type - @Override - public int compare(Object[] objectArr1, Object[] objectArr2) { - final int EQUAL = 0; - int result = EQUAL; - int l = objectArr1.length; - for (int x = 0; x < l; x++) { - if (objectArr1[x] instanceof Integer) { - result = ((Integer)objectArr1[x]).compareTo((Integer)objectArr2[x]); - } else if (objectArr1[x] instanceof Double) { - result = ((Double)objectArr1[x]).compareTo((Double)objectArr2[x]); - } else { // default uses String comparison - result = objectArr1[x].toString().compareTo(objectArr2[x].toString()); - } - if( result != EQUAL) { - return result; - } - } - return result; - } - }); - return underlyingData; - case SORT_BY_ROW: - final TreeMap sortedMap; - try { - sortedMap = new TreeMap(rowIdToIndex); - } catch (ClassCastException e) { - return underlyingData; - } - - final List orderedData = new ArrayList(underlyingData.size()); - for ( final int rowKey : sortedMap.values() ) - orderedData.add(underlyingData.get(rowKey)); - - return orderedData; - default: - return underlyingData; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportVersion.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportVersion.java deleted file mode 100644 index 226365b80..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportVersion.java +++ /dev/null @@ -1,101 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.report; - -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -public enum GATKReportVersion { - /** - * Differences between other versions: - * - Does not allow spaces in cells. - * - Mostly fixed width but has a bug where the string width of floating point - * values was not measured correctly leading to columns that aren't aligned - */ - V0_1("v0.1"), - - /** - * Differences between other versions: - * - Spaces allowed in cells, for example in sample names with spaces in them ex: "C507/FG-CR 6". - * - Fixed width fixed for floating point values - */ - V0_2("v0.2"), - - /* - * Differences between v0.x - * - Added table and report headers - * - Headers changed format, include the number of tables, rows, and metadata for gathering - * - IS GATHERABLE - */ - V1_0("v1.0"), - - /* - * Differences between v1.0 - * - column numbers in header reflect the actual count of columns - * - primary keys are never displayed - */ - V1_1("v1.1"); - - private final String versionString; - - private GATKReportVersion(String versionString) { - this.versionString = versionString; - } - - @Override - public String toString() { - return versionString; - } - - public boolean equals(GATKReportVersion that) { - return (versionString.equals(that.versionString)); - } - - /** - * Returns the GATK Report Version from the file header. - * - * @param header Header from the file starting with ##:GATKReport.v[version] - * @return The version as an enum. - */ - public static GATKReportVersion fromHeader(String header) { - if ( header == null ) - throw new UserException.BadInput("The GATK report has no version specified in the header"); - - if (header.startsWith("##:GATKReport.v0.1 ")) - return GATKReportVersion.V0_1; - - if (header.startsWith("##:GATKReport.v0.2 ")) - return GATKReportVersion.V0_2; - - if (header.startsWith("#:GATKReport.v1.0")) - return GATKReportVersion.V1_0; - - if (header.startsWith("#:GATKReport.v1.1")) - return GATKReportVersion.V1_1; - - throw new UserException.BadInput("The GATK report has an unknown/unsupported version in the header: " + header); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDBBuilder.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDBBuilder.java deleted file mode 100644 index 6fdb9fa0a..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDBBuilder.java +++ /dev/null @@ -1,161 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.samples; - -import htsjdk.samtools.SAMFileHeader; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.SampleUtils; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; - -/** - * - */ -public class SampleDBBuilder { - PedigreeValidationType validationStrictness; - final SampleDB sampleDB = new SampleDB(); - final GenomeAnalysisEngine engine; - - Set samplesFromDataSources = new HashSet(); - Set samplesFromPedigrees = new HashSet(); - - /** for testing only */ - protected SampleDBBuilder(PedigreeValidationType validationStrictness) { - engine = null; - this.validationStrictness = validationStrictness; - } - - /** - * Constructor takes both a SAM header and sample files because the two must be integrated. - */ - public SampleDBBuilder(GenomeAnalysisEngine engine, PedigreeValidationType validationStrictness) { - this.engine = engine; - this.validationStrictness = validationStrictness; - } - - /** - * Hallucinates sample objects for all the samples in the SAM file and stores them - */ - public SampleDBBuilder addSamplesFromSAMHeader(final SAMFileHeader header) { - addSamplesFromSampleNames(SampleUtils.getSAMFileSamples(header)); - return this; - } - - public SampleDBBuilder addSamplesFromSampleNames(final Collection sampleNames) { - for (final String sampleName : sampleNames) { - if (sampleDB.getSample(sampleName) == null) { - final Sample newSample = new Sample(sampleName, sampleDB); - sampleDB.addSample(newSample); - samplesFromDataSources.add(newSample); // keep track of data source samples - } - } - return this; - } - - public SampleDBBuilder addSamplesFromPedigreeFiles(final List pedigreeFiles) { - for (final File pedFile : pedigreeFiles) { - Collection samples = addSamplesFromPedigreeArgument(pedFile); - samplesFromPedigrees.addAll(samples); - } - - return this; - } - - public SampleDBBuilder addSamplesFromPedigreeStrings(final List pedigreeStrings) { - for (final String pedString : pedigreeStrings) { - Collection samples = addSamplesFromPedigreeArgument(pedString); - samplesFromPedigrees.addAll(samples); - } - - return this; - } - - /** - * Parse one sample file and integrate it with samples that are already there - * Fail quickly if we find any errors in the file - */ - private Collection addSamplesFromPedigreeArgument(File sampleFile) { - final PedReader reader = new PedReader(); - - try { - return reader.parse(sampleFile, getMissingFields(sampleFile), sampleDB); - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(sampleFile, e); - } - } - - private Collection addSamplesFromPedigreeArgument(final String string) { - final PedReader reader = new PedReader(); - return reader.parse(string, getMissingFields(string), sampleDB); - } - - public SampleDB getFinalSampleDB() { - validate(); - return sampleDB; - } - - public EnumSet getMissingFields(final Object engineArg) { - if ( engine == null ) - return EnumSet.noneOf(PedReader.MissingPedField.class); - else { - final List posTags = engine.getTags(engineArg).getPositionalTags(); - return PedReader.parseMissingFieldTags(engineArg, posTags); - } - } - - // -------------------------------------------------------------------------------- - // - // Validation - // - // -------------------------------------------------------------------------------- - - protected final void validate() { - validatePedigreeIDUniqueness(); - if ( validationStrictness != PedigreeValidationType.SILENT ) { - // check that samples in data sources are all annotated, if anything is annotated - if ( ! samplesFromPedigrees.isEmpty() && ! samplesFromDataSources.isEmpty() ) { - final Set sampleNamesFromPedigrees = new HashSet(); - for ( final Sample pSample : samplesFromPedigrees ) - sampleNamesFromPedigrees.add(pSample.getID()); - - for ( final Sample dsSample : samplesFromDataSources ) - if ( ! sampleNamesFromPedigrees.contains(dsSample.getID()) ) - throw new UserException("Sample " + dsSample.getID() + " found in data sources but not in pedigree files with STRICT pedigree validation"); - } - } - } - - private void validatePedigreeIDUniqueness() { - Set pedigreeIDs = new HashSet(); - for ( Sample sample : samplesFromPedigrees ) { - pedigreeIDs.add(sample.getID()); - } - assert pedigreeIDs.size() == samplesFromPedigrees.size() : "The number of sample IDs extracted from the pedigree does not equal the number of samples in the pedigree. Is a sample associated with multiple families?"; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCache.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCache.java deleted file mode 100644 index d28ea3be4..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCache.java +++ /dev/null @@ -1,168 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.traversals; - -import org.broadinstitute.gatk.engine.downsampling.Downsampler; -import org.broadinstitute.gatk.engine.downsampling.ReservoirDownsampler; -import org.broadinstitute.gatk.utils.sam.AlignmentStartComparator; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -/** - * Subsystem to track a list of all reads currently live in the TraverseActiveRegions system, - * while limiting the total number of reads to a maximum capacity. - * - * User: depristo - * Date: 4/7/13 - * Time: 11:23 AM - */ -public class TAROrderedReadCache { - private final int maxCapacity; - private ArrayList undownsampledCache; - private Downsampler downsampler; - - private static final int UNDOWNSAMPLED_CACHE_MAX_INITIAL_SIZE = 10000; - - /** - * Create a new empty ReadCache - * @param maxCapacity the max capacity of the read cache. - */ - public TAROrderedReadCache( final int maxCapacity ) { - if ( maxCapacity < 0 ) throw new IllegalArgumentException("maxCapacity must be >= 0 but got " + maxCapacity); - this.maxCapacity = maxCapacity; - - // The one we're not currently using will always be null: - initializeUndownsampledCache(); - this.downsampler = null; - } - - /** - * Moves all reads over to the downsampler, causing it to be used from this point on. Should be called - * when the undownsampledCache fills up and we need to start discarding reads. Since the - * ReservoirDownsampler doesn't preserve relative ordering, pop operations become expensive - * after this point, as they require a O(n log n) sort. - */ - private void activateDownsampler() { - downsampler = new ReservoirDownsampler<>(maxCapacity, false); - downsampler.submit(undownsampledCache); - undownsampledCache = null; // preferable to the O(n) clear() method - } - - /** - * Allocate the undownsampled cache used when we have fewer than maxCapacity items - */ - private void initializeUndownsampledCache() { - undownsampledCache = new ArrayList<>(Math.min(maxCapacity + 1, UNDOWNSAMPLED_CACHE_MAX_INITIAL_SIZE)); - } - - /** - * What's the maximum number of reads we'll store in the cache? - * @return a positive integer - */ - public int getMaxCapacity() { - return maxCapacity; - } - - /** - * Add a single read to this cache. Assumed to be in sorted order w.r.t. the previously added reads - * @param read a read to add - */ - public void add( final GATKSAMRecord read ) { - if ( read == null ) throw new IllegalArgumentException("Read cannot be null"); - - if ( downsampler != null ) { - downsampler.submit(read); - } - else { - undownsampledCache.add(read); - - // No more room in the undownsampledCache? Time to start downsampling - if ( undownsampledCache.size() > maxCapacity ) { - activateDownsampler(); - } - } - } - - /** - * Add a collection of reads to this cache. Assumed to be in sorted order w.r.t. the previously added reads and each other - * @param reads a collection of reads to add - */ - public void addAll( final List reads ) { - if ( reads == null ) throw new IllegalArgumentException("Reads cannot be null"); - for ( final GATKSAMRecord read : reads ) { - add(read); - } - } - - /** - * How many reads are currently in the cache? - * @return a positive integer - */ - public int size() { - return downsampler != null ? downsampler.size() : undownsampledCache.size(); - } - - /** - * How many reads were discarded since the last call to popCurrentReads - * - * @return number of items discarded during downsampling since last pop operation - */ - public int getNumDiscarded() { - return downsampler != null ? downsampler.getNumberOfDiscardedItems() : 0; - } - - /** - * Removes all reads currently in the cache, and returns them in sorted order (w.r.t. alignmentStart) - * - * Flushes this cache, so after this call the cache will contain no reads, and we'll be in the same - * initial state as the constructor would put us in, with a non-null undownsampledCache and a null - * downsampler. - * - * @return a list of GATKSAMRecords in this cache - */ - public List popCurrentReads() { - final List poppedReads; - - if ( downsampler == null ) { - poppedReads = undownsampledCache; // avoid making a copy here, since we're going to allocate a new cache - } - else { - // If we triggered the downsampler, we need to sort the reads before returning them, - // since the ReservoirDownsampler is not guaranteed to preserve relative ordering of items. - // After consuming the downsampled items in this call to popCurrentReads(), we switch back - // to using the undownsampledCache until we fill up again. - poppedReads = downsampler.consumeFinalizedItems(); // avoid making a copy here - Collections.sort(poppedReads, new AlignmentStartComparator()); - downsampler = null; - } - - initializeUndownsampledCache(); - return poppedReads; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegions.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegions.java deleted file mode 100644 index 7d93311f2..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegions.java +++ /dev/null @@ -1,719 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.traversals; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.datasources.providers.*; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.walkers.ActiveRegionTraversalParameters; -import org.broadinstitute.gatk.engine.walkers.ActiveRegionWalker; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.SampleUtils; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; -import org.broadinstitute.gatk.utils.activeregion.ActivityProfile; -import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; -import org.broadinstitute.gatk.utils.activeregion.BandPassActivityProfile; -import org.broadinstitute.gatk.utils.nanoScheduler.NSMapFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NSProgressFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NSReduceFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NanoScheduler; -import org.broadinstitute.gatk.utils.progressmeter.ProgressMeter; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.io.PrintStream; -import java.util.Collection; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; - -/** - * Implement active region traversal - * - * User: depristo - * Date: 1/9/13 - * Time: 4:45 PM - * - * Live region: - * - * The ART tracks a thing called the live region. The live region is a position on a specific contig - * of the alignment start of the last read we processed during this traversal. Because the - * read stream is sorted, future reads must occurs in the the live region. Therefore the the dead region - * (everything to the left of the live boundary) cannot have any more read data. The live / dead - * regions are used to decide when we can safely call map on active regions, as only active regions - * contained completely within the dead region (including extensions) have a complete set of read data - * in the collected read list. All of the data related to the live region is captured by the local - * variable spanOfLastReadSeen - * - */ -public final class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { - private final static boolean DEBUG = false; - protected final static Logger logger = Logger.getLogger(TraversalEngine.class); - protected final static boolean LOG_READ_CARRYING = false; - - // set by the traversal - private boolean walkerHasPresetRegions = false; - private int activeRegionExtension = -1; - private int maxRegionSize = -1; - private int minRegionSize = -1; - - private final LinkedList workQueue = new LinkedList<>(); - - private TAROrderedReadCache myReads = null; - - private GenomeLoc lastRegionProcessed = null; - private GenomeLoc spanOfLastReadSeen = null; - private ActivityProfile activityProfile = null; - int maxReadsInMemory = 0; - ActiveRegionWalker walker; - - final NanoScheduler nanoScheduler; - - /** - * Data to use in the ActiveRegionWalker.map function produced by the NanoScheduler input iterator - */ - private static class MapData { - public ActiveRegion activeRegion; - public RefMetaDataTracker tracker; - - private MapData(ActiveRegion activeRegion, RefMetaDataTracker tracker) { - this.activeRegion = activeRegion; - this.tracker = tracker; - } - } - - /** - * Create a single threaded active region traverser - */ - public TraverseActiveRegions() { - this(1); - } - - /** - * Create an active region traverser that uses nThreads for getting its work done - * @param nThreads number of threads - */ - public TraverseActiveRegions(final int nThreads) { - nanoScheduler = new NanoScheduler<>(nThreads); - nanoScheduler.setProgressFunction(new NSProgressFunction() { - @Override - public void progress(MapData lastActiveRegion) { - if ( lastActiveRegion != null ) - // note, need to use getStopLocation so we don't give an interval to ProgressMeterDaemon - printProgress(lastActiveRegion.activeRegion.getLocation().getStopLocation()); - } - }); - } - - /** - * Have the debugging output streams been initialized already? - * - * We have to do lazy initialization because when the initialize() function is called - * the streams aren't yet initialized in the GATK walker. - */ - private boolean streamsInitialized = false; - - @Override - public void initialize(GenomeAnalysisEngine engine, Walker walker, ProgressMeter progressMeter) { - super.initialize(engine, walker, progressMeter); - - this.walker = (ActiveRegionWalker)walker; - if ( this.walker.wantsExtendedReads() && ! this.walker.wantsNonPrimaryReads() ) { - throw new IllegalArgumentException("Active region walker " + this.walker + " requested extended events but not " + - "non-primary reads, an inconsistent state. Please modify the walker"); - } - - ActiveRegionTraversalParameters annotation = walker.getClass().getAnnotation(ActiveRegionTraversalParameters.class); - this.activeRegionExtension = this.walker.activeRegionExtension == null ? annotation.extension() : this.walker.activeRegionExtension; - this.maxRegionSize = this.walker.activeRegionMaxSize == null ? annotation.maxRegion() : this.walker.activeRegionMaxSize; - this.minRegionSize = annotation.minRegion(); - final double bandPassSigma = this.walker.bandPassSigma == null ? annotation.bandPassSigma() : this.walker.bandPassSigma; - walkerHasPresetRegions = this.walker.hasPresetActiveRegions(); - - activityProfile = new BandPassActivityProfile(engine.getGenomeLocParser(), engine.getIntervals(), this.walker.maxProbPropagationDistance, this.walker.activeProbThreshold, - BandPassActivityProfile.MAX_FILTER_SIZE, bandPassSigma); - - final int maxReadsAcrossSamples = annotation.maxReadsToHoldInMemoryPerSample() * SampleUtils.getSAMFileSamples(engine).size(); - final int maxReadsToHoldInMemory = Math.min(maxReadsAcrossSamples, annotation.maxReadsToHoldTotal()); - myReads = new TAROrderedReadCache(maxReadsToHoldInMemory); - } - - // ------------------------------------------------------------------------------------- - // - // Utility functions - // - // ------------------------------------------------------------------------------------- - - /** - * Load in the preset regions for contig into workQueue - * - * Should be called before starting to process work on contig - * - * Can only be called when walkerHasPresetRegions is true or an IllegalStateException will be thrown - * - * @param contig the contig we are about to process - */ - protected void loadPresetRegionsForContigToWorkQueue(final String contig) { - if ( ! walkerHasPresetRegions ) throw new IllegalStateException("only appropriate to call when walker has preset regions"); - - final GenomeLoc contigSpan = engine.getGenomeLocParser().createOverEntireContig(contig); - for ( final GenomeLoc loc : this.walker.getPresetActiveRegions().getOverlapping(contigSpan) ) { - workQueue.add(new ActiveRegion(loc, null, true, engine.getGenomeLocParser(), getActiveRegionExtension())); - } - } - - protected int getActiveRegionExtension() { - return activeRegionExtension; - } - - protected int getMaxRegionSize() { - return maxRegionSize; - } - - protected int getMinRegionSize() { - return minRegionSize; - } - - @Override - public String getTraversalUnits() { - return "active regions"; - } - - @Override - public String toString() { - return "TraverseActiveRegions"; - } - - /** - * Is the loc outside of the intervals being requested for processing by the GATK? - * @param loc - * @return - */ - protected boolean outsideEngineIntervals(final GenomeLoc loc) { - return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc); - } - - // ------------------------------------------------------------------------------------- - // - // Actual traverse function - // - // ------------------------------------------------------------------------------------- - - /** - * Did read appear in the last shard? - * - * When we transition across shard boundaries we see duplicate reads because - * each shard contains the reads that *overlap* the shard. So if we just finished - * shard 1-1000 and are now in 1001-2000 we'll see duplicate reads from 1001 - * that overlapped 1-1000. This function tests read to determine if we would have - * seen it before by asking if read.getAlignmentStart() is less than the - * stop position of the last seen read at the start of the traversal. The reason - * we need to use the location of the last read at the start of the traversal - * is that we update the lastRead during the traversal, and we only want to filter - * out reads whose start is before the last read of the previous shard, not the - * current shard. - * - * @param locOfLastReadAtTraversalStart the location of the last read seen at the start of the traversal - * @param read the read we want to test if it's already been seen in the last shard - * @return true if read would have appeared in the last shard, false otherwise - */ - @Requires({"read != null"}) - private boolean appearedInLastShard(final GenomeLoc locOfLastReadAtTraversalStart, final GATKSAMRecord read) { - if ( locOfLastReadAtTraversalStart == null ) - // we're in the first shard, so obviously the answer is no - return false; - else { - // otherwise check to see if the alignment occurred in the previous shard - return read.getAlignmentStart() <= locOfLastReadAtTraversalStart.getStart() - // we're on the same contig - && read.getReferenceIndex() == locOfLastReadAtTraversalStart.getContigIndex(); - } - - } - - @Override - public T traverse( final ActiveRegionWalker walker, - final LocusShardDataProvider dataProvider, - T sum) { - if ( LOG_READ_CARRYING || logger.isDebugEnabled() ) - logger.info(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); - - nanoScheduler.setDebug(false); - final Iterator activeRegionIterator = new ActiveRegionIterator(dataProvider); - final TraverseActiveRegionMap myMap = new TraverseActiveRegionMap(); - final TraverseActiveRegionReduce myReduce = new TraverseActiveRegionReduce(); - final T result = nanoScheduler.execute(activeRegionIterator, myMap, sum, myReduce); - - return result; - } - - private class ActiveRegionIterator implements Iterator { - private final LocusShardDataProvider dataProvider; - private LinkedList readyActiveRegions = new LinkedList<>(); - private boolean done = false; - private final LocusView locusView; - private final LocusReferenceView referenceView; - private final GenomeLoc locOfLastReadAtTraversalStart; - private final IntervalReferenceOrderedView referenceOrderedDataView; - private final GenomeLoc currentWindow; - private final boolean processRemainingActiveRegions; - - public ActiveRegionIterator( final LocusShardDataProvider dataProvider ) { - this.dataProvider = dataProvider; - locusView = new AllLocusView(dataProvider); - referenceView = new LocusReferenceView( walker, dataProvider ); - - // The data shard may carry a number of locations to process (due to being indexed together). - // This value is just the interval we are processing within the entire provider - currentWindow = dataProvider.getLocus(); - final int currentWindowPos = dataProvider.getShard().getGenomeLocs().indexOf(currentWindow); - if ( currentWindowPos == -1 ) throw new IllegalStateException("Data provider " + dataProvider + " didn't have our current window in it " + currentWindow); - processRemainingActiveRegions = currentWindowPos == dataProvider.getShard().getGenomeLocs().size() - 1; - - // the rodSpan covers all of the bases in the activity profile, including all of the bases - // through the current window interval. This is because we may issue a query to get data for an - // active region spanning before the current interval as far back as the start of the current profile, - // if we have pending work to do that finalizes in this interval. - final GenomeLoc rodSpan = activityProfile.getSpan() == null ? currentWindow : activityProfile.getSpan().endpointSpan(currentWindow); - if ( ! dataProvider.getShard().getLocation().containsP(rodSpan) ) throw new IllegalStateException("Rod span " + rodSpan + " isn't contained within the data shard " + dataProvider.getShard().getLocation() + ", meaning we wouldn't get all of the data we need"); - referenceOrderedDataView = new IntervalReferenceOrderedView( dataProvider, rodSpan ); - - // We keep processing while the next reference location is within the interval - locOfLastReadAtTraversalStart = spanOfLastSeenRead(); - - // load in the workQueue the present regions that span the current contig, if it's different from the last one - if ( walkerHasPresetRegions && ( lastRegionProcessed == null || ! currentWindow.onSameContig(lastRegionProcessed)) ) { - loadPresetRegionsForContigToWorkQueue(currentWindow.getContig()); - } - - // remember the last region we processed for sanity checking later - lastRegionProcessed = currentWindow; - } - - @Override public void remove() { throw new UnsupportedOperationException("Cannot remove from ActiveRegionIterator"); } - - @Override - public MapData next() { - return readyActiveRegions.pop(); - } - @Override - public boolean hasNext() { - if ( engine.exceedsRuntimeLimit() ) // too much time has been dedicated to doing work, just stop - return false; - if ( ! readyActiveRegions.isEmpty() ) - return true; - if ( done ) - return false; - else { - - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - rememberLastLocusLocation(location); - - // get all of the new reads that appear in the current pileup, and them to our list of reads - // provided we haven't seen them before - final Collection reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); - for( final GATKSAMRecord read : reads ) { - // note that ActiveRegionShards span entire contigs, so this check is in some - // sense no longer necessary, as any read that appeared in the last shard would now - // by definition be on a different contig. However, the logic here doesn't hurt anything - // and makes us robust should we decided to provide shards that don't fully span - // contigs at some point in the future - if ( ! appearedInLastShard(locOfLastReadAtTraversalStart, read) ) { - rememberLastReadLocation(read); - myReads.add(read); - } - } - - // skip this location -- it's not part of our engine intervals - if ( outsideEngineIntervals(location) ) - continue; - - // we've move across some interval boundary, restart profile - final boolean flushProfile = ! activityProfile.isEmpty() - && ( activityProfile.getContigIndex() != location.getContigIndex() - || location.getStart() != activityProfile.getStop() + 1); - final List newActiveRegions = prepActiveRegionsForProcessing(walker, flushProfile, false, referenceOrderedDataView); - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation()); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - addIsActiveResult(walker, tracker, refContext, locus); - - maxReadsInMemory = Math.max(myReads.size(), maxReadsInMemory); - printProgress(location); - - if ( ! newActiveRegions.isEmpty() ) { - readyActiveRegions.addAll(newActiveRegions); - if ( DEBUG ) - for ( final MapData region : newActiveRegions ) - logger.info("Adding region to queue for processing " + region.activeRegion); - return true; - } - } - - if ( processRemainingActiveRegions ) { - // we've run out of stuff to process, and since shards now span entire contig boundaries - // we should finalized our regions. This allows us to continue to use our referenceOrderedDataView - // which would otherwise be shutdown. Only followed when the microschedule says that we're - // inside of the last window in the current shard - readyActiveRegions.addAll(prepActiveRegionsForProcessing(walker, true, true, referenceOrderedDataView)); - } - - return ! readyActiveRegions.isEmpty(); - } - } - } - - // ------------------------------------------------------------------------------------- - // - // Functions to manage and interact with the live / dead zone - // - // ------------------------------------------------------------------------------------- - - /** - * Update the live region to reflect that the last read we've seen in the traversal is read - * - * Requires that sequential calls always be provided reads in coordinate sorted order - * - * @param read the last read we've seen during the traversal - */ - @Requires({"read != null"}) - protected void rememberLastReadLocation(final GATKSAMRecord read) { - final GenomeLoc currentLocation = engine.getGenomeLocParser().createGenomeLoc(read); - if ( spanOfLastReadSeen == null ) - spanOfLastReadSeen = currentLocation; - else { - if ( currentLocation.isBefore(spanOfLastReadSeen) ) - throw new IllegalStateException("Updating last read seen in the traversal with read " + read + " with span " + currentLocation + " but this occurs before the previously seen read " + spanOfLastReadSeen); - spanOfLastReadSeen = currentLocation; - } - } - - /** - * Update the live region to reflect that we've reached locus - * - * This function is complementary to #rememberLastReadLocation, but if we don't have any reads for a long - * time (e.g., there's no coverage) we will keep active regions around far longer than necessary. - * - * Only updates the span if it's beyond the last seen - * - * @param currentLocation the current location we've processed on the genome - */ - protected void rememberLastLocusLocation(final GenomeLoc currentLocation) { - if ( spanOfLastReadSeen == null ) - spanOfLastReadSeen = currentLocation; - else { - if ( currentLocation.isPast(spanOfLastReadSeen) ) - spanOfLastReadSeen = currentLocation; - } - } - - - /** - * Get a GenomeLoc indicating the start (heading to the right) of the live ART region. - * @return the left-most position of the live region on the genome - */ - protected GenomeLoc spanOfLastSeenRead() { - return spanOfLastReadSeen; - } - - /** - * Is the active region completely within the traversal's dead zone? - * - * @param region the region we want to test - * @return true if the extended location of region is completely within the current dead zone, false otherwise - */ - protected boolean regionCompletelyWithinDeadZone(final ActiveRegion region) { - if ( spanOfLastSeenRead() == null ) - return false; - - final int contigCmp = region.getExtendedLoc().compareContigs(spanOfLastSeenRead()); - if ( contigCmp > 0 ) - throw new IllegalStateException("Active region " + region + " on a contig after last seen read " + spanOfLastSeenRead()); - else { - return contigCmp < 0 || region.getExtendedLoc().getStop() < spanOfLastSeenRead().getStart(); - } - } - - /** - * Is the read dead? That is, can it no longer be in any future active region, and therefore can be discarded? - * - * read: start |--------> stop ------ stop + extension - * region: start |-----------------| end - * - * Since the regions are coming in order, read could potentially be contained in a future interval if - * stop + activeRegionExtension >= end. If, on the other hand, stop + extension is < the end - * of this region, then we can discard it, since any future region could only include reads - * up to end + 1 - extension. - * - * Note that this function doesn't care about the dead zone. We're assuming that by - * actually calling this function with an active region that region is already in the dead zone, - * so checking that the read is in the dead zone doesn't make sense. - * - * @param read the read we're testing - * @param activeRegion the current active region - * @return true if the read is dead, false other - */ - @Requires({"read != null", "activeRegion != null"}) - private boolean readCannotOccurInAnyMoreActiveRegions(final GATKSAMRecord read, final ActiveRegion activeRegion) { - return read.getReferenceIndex() < activeRegion.getLocation().getContigIndex() || - ( read.getReferenceIndex() == activeRegion.getLocation().getContigIndex() - && read.getAlignmentEnd() + getActiveRegionExtension() < activeRegion.getLocation().getStop() ); - } - - // ------------------------------------------------------------------------------------- - // - // Functions to write out activity profiles and active regions - // - // ------------------------------------------------------------------------------------- - - /** - * Initialize the debugging output streams (activity profile and active regions), if not done so already - */ - @Ensures("streamsInitialized == true") - private void initializeOutputStreamsIfNecessary() { - if ( ! streamsInitialized ) { - streamsInitialized = true; - if ( walker.activityProfileOutStream != null ) { - printIGVFormatHeader(walker.activityProfileOutStream, "line", "ActivityProfile"); - } - - if ( walker.activeRegionOutStream != null ) { - printIGVFormatHeader(walker.activeRegionOutStream, "line", "ActiveRegions"); - } - } - } - - /** - * Helper function to write out a IGV formatted line to out, at loc, with values - * - * http://www.broadinstitute.org/software/igv/IGV - * - * @param out a non-null PrintStream where we'll write our line - * @param graphType the type of graph to show in IGV for this track - * @param columns the column names for this IGV track - */ - @Requires({ - "out != null", - "graphType != null", - "columns.length > 0" - }) - private void printIGVFormatHeader(final PrintStream out, final String graphType, final String ... columns ) { - out.printf("#track graphType=%s%n", graphType); - out.printf("Chromosome\tStart\tEnd\tFeature\t%s%n", Utils.join("\t", columns)); - - } - - /** - * Helper function to write out a IGV formatted line to out, at loc, with values - * - * http://www.broadinstitute.org/software/igv/IGV - * - * @param out a non-null PrintStream where we'll write our line - * @param loc the location of values - * @param featureName string name of this feature (see IGV format) - * @param values the floating point values to associate with loc and feature name in out - */ - @Requires({ - "out != null", - "loc != null", - "values.length > 0" - }) - private void printIGVFormatRow(final PrintStream out, final GenomeLoc loc, final String featureName, final double ... values) { - // note that start and stop are 0 based, but the stop is exclusive so we don't subtract 1 - out.printf("%s\t%d\t%d\t%s", loc.getContig(), loc.getStart() - 1, loc.getStop(), featureName); - for ( final double value : values ) - out.print(String.format("\t%.5f", value)); - out.println(); - } - - /** - * Write out activity profile information, if requested by the walker - * - * @param states the states in the current activity profile - */ - @Requires("states != null") - private void writeActivityProfile(final List states) { - if ( walker.activityProfileOutStream != null ) { - initializeOutputStreamsIfNecessary(); - for ( final ActivityProfileState state : states ) { - printIGVFormatRow(walker.activityProfileOutStream, state.getLoc(), "state", Math.min(state.isActiveProb, 1.0)); - } - } - } - - /** - * Write out each active region to the walker activeRegionOutStream - * - * @param region the region we're currently operating on - */ - @Requires("region != null") - private void writeActiveRegion(final ActiveRegion region) { - if( walker.activeRegionOutStream != null ) { - initializeOutputStreamsIfNecessary(); - printIGVFormatRow(walker.activeRegionOutStream, region.getLocation().getStartLocation(), - "end-marker", 0.0); - printIGVFormatRow(walker.activeRegionOutStream, region.getLocation(), - "size=" + region.getLocation().size(), region.isActive() ? 1.0 : -1.0); - } - } - - - // ------------------------------------------------------------------------------------- - // - // Functions to process active regions that are ready for map / reduce calls - // - // ------------------------------------------------------------------------------------- - - /** - * Invoke the walker isActive function, and incorporate its result into the activity profile - * - * @param walker the walker we're running - * @param tracker the ref meta data tracker to pass on to the isActive function of walker - * @param refContext the refContext to pass on to the isActive function of walker - * @param locus the AlignmentContext to pass on to the isActive function of walker - */ - private void addIsActiveResult(final ActiveRegionWalker walker, - final RefMetaDataTracker tracker, final ReferenceContext refContext, - final AlignmentContext locus) { - // must be called, even if we won't use the result, to satisfy walker contract - final ActivityProfileState state = walker.isActive( tracker, refContext, locus ); - if ( walker.forceActive) state.isActiveProb = 1.0; - if ( ! walkerHasPresetRegions ) { - activityProfile.add(state); - } - } - - /** - * Take the individual isActive calls and integrate them into contiguous active regions and - * add these blocks of work to the work queue - * band-pass filter the list of isActive probabilities and turn into active regions - */ - private List prepActiveRegionsForProcessing(final ActiveRegionWalker walker, - final boolean flushActivityProfile, - final boolean forceAllRegionsToBeActive, - final IntervalReferenceOrderedView referenceOrderedDataView) { - if ( ! walkerHasPresetRegions ) { - // We don't have preset regions, so we get our regions from the activity profile - final Collection activeRegions = activityProfile.popReadyActiveRegions(getActiveRegionExtension(), getMinRegionSize(), getMaxRegionSize(), flushActivityProfile); - workQueue.addAll(activeRegions); - if ( ! activeRegions.isEmpty() && logger.isDebugEnabled() ) logger.debug("Integrated " + activityProfile.size() + " isActive calls into " + activeRegions.size() + " regions." ); - } - - // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them - final LinkedList readyRegions = new LinkedList<>(); - while( workQueue.peek() != null ) { - final ActiveRegion activeRegion = workQueue.peek(); - if ( forceAllRegionsToBeActive || regionCompletelyWithinDeadZone(activeRegion) ) { - writeActivityProfile(activeRegion.getSupportingStates()); - writeActiveRegion(activeRegion); - readyRegions.add(prepActiveRegionForProcessing(workQueue.remove(), walker, referenceOrderedDataView)); - } else { - break; - } - } - - return readyRegions; - - } - - private MapData prepActiveRegionForProcessing(final ActiveRegion activeRegion, - final ActiveRegionWalker walker, - final IntervalReferenceOrderedView referenceOrderedDataView) { - final List stillLive = new LinkedList<>(); - for ( final GATKSAMRecord read : myReads.popCurrentReads() ) { - boolean killed = false; - final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); - - if( activeRegion.getLocation().overlapsP( readLoc ) ) { - activeRegion.add(read); - - if ( ! walker.wantsNonPrimaryReads() ) { - killed = true; - } - } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { - activeRegion.add( read ); - } - - // if the read hasn't already been killed, check if it cannot occur in any more active regions, and maybe kill it - if ( ! killed && readCannotOccurInAnyMoreActiveRegions(read, activeRegion) ) { - killed = true; - } - - // keep track of all of the still live active regions - if ( ! killed ) stillLive.add(read); - } - myReads.addAll(stillLive); - - if ( logger.isDebugEnabled() ) { - logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive() ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReadSpanLoc()); - } - - if ( LOG_READ_CARRYING ) - logger.info(String.format("Processing region %20s span=%3d active?=%5b with %4d reads. Overall max reads carried is %s", - activeRegion.getLocation(), activeRegion.getLocation().size(), activeRegion.isActive(), activeRegion.size(), maxReadsInMemory)); - - // prepare the RefMetaDataTracker information - final GenomeLoc loc = activeRegion.getLocation(); - // get all of the RODs that cover the active region (without extension) - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataForInterval(loc); - // trim away all of the features that occurred before this location, as we will not need them in the future - referenceOrderedDataView.trimCurrentFeaturesToLoc(loc); - - return new MapData(activeRegion, tracker); - } - - private class TraverseActiveRegionMap implements NSMapFunction { - @Override - public M apply(final MapData mapData) { - if ( DEBUG ) logger.info("Executing walker.map for " + mapData.activeRegion + " in thread " + Thread.currentThread().getName()); - return walker.map(mapData.activeRegion, mapData.tracker); - } - } - - private class TraverseActiveRegionReduce implements NSReduceFunction { - @Override - public T apply(M one, T sum) { - return walker.reduce(one, sum); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicates.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicates.java deleted file mode 100644 index 6cffe9427..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicates.java +++ /dev/null @@ -1,205 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.traversals; - -import htsjdk.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.gatk.engine.datasources.providers.ReadView; -import org.broadinstitute.gatk.engine.iterators.PushbackIterator; -import org.broadinstitute.gatk.engine.walkers.DuplicateWalker; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.*; - -/** - * @author Mark DePristo - * @version 0.1 - *

    - * Class TraverseDuplicates - *

    - * This class handles traversing lists of duplicate reads in the new shardable style - */ -public class TraverseDuplicates extends TraversalEngine,ReadShardDataProvider> { - /** our log, which we want to capture anything from this class */ - protected static Logger logger = Logger.getLogger(TraverseDuplicates.class); - - /** Turn this to true to enable logger.debug output */ - private final boolean DEBUG = false; - - @Override - public String getTraversalUnits() { - return "dups"; - } - - private List readsAtLoc(final GATKSAMRecord read, PushbackIterator iter) { - GenomeLoc site = engine.getGenomeLocParser().createGenomeLoc(read); - ArrayList l = new ArrayList(); - - l.add(read); - for (SAMRecord read2 : iter) { - GenomeLoc site2 = engine.getGenomeLocParser().createGenomeLoc(read2); - - // the next read starts too late - if (site2.getStart() != site.getStart()) { - iter.pushback(read2); - break; - } else { - l.add((GATKSAMRecord) read2); - } - } - - return l; - } - - /** - * Creates a set of lists of reads, where each list contains reads from the same underlying molecule according - * to their duplicate flag and their (and mate, if applicable) start/end positions. - * - * @param reads the list of reads to split into unique molecular samples - * @return - */ - protected Set> uniqueReadSets(List reads) { - Set> readSets = new LinkedHashSet>(); - - // for each read, find duplicates, and either add the read to its duplicate list or start a new one - for ( GATKSAMRecord read : reads ) { - List readSet = findDuplicateReads(read, readSets); - - if ( readSet == null ) { - readSets.add(new ArrayList(Arrays.asList(read))); // copy so I can add to the list - } else { - readSet.add(read); - } - } - - return readSets; - } - - /** - * Find duplicate reads for read in the set of unique reads. This is effective a duplicate marking algorithm, - * but it relies for safety's sake on the file itself being marked by a true duplicate marking algorithm. Pair - * and single-end read aware. - * - * @param read - * @param readSets - * @return The list of duplicate reads that read is a member of, or null if it's the only one of its kind - */ - protected List findDuplicateReads(GATKSAMRecord read, Set> readSets ) { - if ( read.getReadPairedFlag() ) { - // paired - final GenomeLoc readMateLoc = engine.getGenomeLocParser().createGenomeLoc(read.getMateReferenceName(), read.getMateAlignmentStart(), read.getMateAlignmentStart()); - - for (List reads : readSets) { - GATKSAMRecord key = reads.get(0); - - // read and key start at the same place, and either the this read and the key - // share a mate location or the read is flagged as a duplicate - if ( read.getAlignmentStart() == key.getAlignmentStart() && key.getReadPairedFlag() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) ) { - // at least one has to be marked as a duplicate - final GenomeLoc keyMateLoc = engine.getGenomeLocParser().createGenomeLoc(key.getMateReferenceName(), key.getMateAlignmentStart(), key.getMateAlignmentStart()); - if ( readMateLoc.compareTo(keyMateLoc) == 0 ) { - // we are at the same position as the dup and have the same mat pos, it's a dup - if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s %d %s vs. %s", read, reads.size(), readMateLoc, keyMateLoc)); - return reads; - } - } - } - } else { - for (List reads : readSets) { - GATKSAMRecord key = reads.get(0); - boolean v = (! key.getReadPairedFlag()) && read.getAlignmentStart() == key.getAlignmentStart() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) && read.getReadLength() == key.getReadLength(); - //System.out.printf("%s %s %b %b %d %d %d %d => %b%n", - // read.getReadPairedFlag(), key.getReadPairedFlag(), read.getDuplicateReadFlag(), key.getDuplicateReadFlag(), - // read.getAlignmentStart(), key.getAlignmentStart(), read.getReadLength(), key.getReadLength(), v); - if ( v ) { - //System.out.printf("Returning reads...%n"); - return reads; - } - } - } - - return null; - } - - // -------------------------------------------------------------------------------------------------------------- - // - // new style interface to the system - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Traverse by reads, given the data and the walker - * - * @param walker the walker to execute over - * @param sum of type T, the return from the walker - * - * @return the result type T, the product of all the reduce calls - */ - public T traverse(DuplicateWalker walker, - ReadShardDataProvider dataProvider, - T sum) { - PushbackIterator iter = new PushbackIterator(new ReadView(dataProvider).iterator()); - - /** - * while we still have more reads: - * ok, here's the idea. We get all the reads that start at the same position in the genome - * We then split the list of reads into sublists of reads: - * -> those with the same mate pair position, for paired reads - * -> those flagged as unpaired and duplicated but having the same start and end - */ - boolean done = walker.isDone(); - for (SAMRecord read : iter) { - if ( done ) break; - // get the genome loc from the read - GenomeLoc site = engine.getGenomeLocParser().createGenomeLoc(read); - - Set> readSets = uniqueReadSets(readsAtLoc((GATKSAMRecord) read, iter)); - if ( DEBUG ) logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d read sets", site, readSets.size())); - - // Jump forward in the reference to this locus location - AlignmentContext locus = new AlignmentContext(site, new ReadBackedPileupImpl(site)); - - // update the number of duplicate sets we've seen - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // actually call filter and map, accumulating sum - final boolean keepMeP = walker.filter(site, locus, readSets); - if (keepMeP) { - M x = walker.map(site, locus, readSets); - sum = walker.reduce(x, sum); - } - - printProgress(site.getStopLocation()); - done = walker.isDone(); - } - - return sum; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseLociNano.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseLociNano.java deleted file mode 100644 index 02c1a7e7f..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseLociNano.java +++ /dev/null @@ -1,304 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.traversals; - -import org.broadinstitute.gatk.engine.WalkerManager; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.datasources.providers.*; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.walkers.DataSource; -import org.broadinstitute.gatk.engine.walkers.LocusWalker; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.nanoScheduler.NSMapFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NSProgressFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NSReduceFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NanoScheduler; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; - -import java.util.Iterator; - -/** - * A simple solution to iterating over all reference positions over a series of genomic locations. - */ -public class TraverseLociNano extends TraversalEngine,LocusShardDataProvider> { - /** our log, which we want to capture anything from this class */ - private static final boolean DEBUG = false; - - final NanoScheduler nanoScheduler; - - public TraverseLociNano(int nThreads) { - nanoScheduler = new NanoScheduler(nThreads); - nanoScheduler.setProgressFunction(new TraverseLociProgress()); - } - - @Override - public final String getTraversalUnits() { - return "sites"; - } - - protected static class TraverseResults { - final int numIterations; - final T reduceResult; - - public TraverseResults(int numIterations, T reduceResult) { - this.numIterations = numIterations; - this.reduceResult = reduceResult; - } - } - - @Override - public T traverse( LocusWalker walker, - LocusShardDataProvider dataProvider, - T sum) { - logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider)); - - final LocusView locusView = getLocusView( walker, dataProvider ); - - if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all - //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); - ReferenceOrderedView referenceOrderedDataView = null; - if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) - referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); - else - referenceOrderedDataView = (RodLocusView)locusView; - - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - - final TraverseResults result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); - sum = result.reduceResult; - dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); - } - - // We have a final map call to execute here to clean up the skipped based from the - // last position in the ROD to that in the interval - if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { - // only do this if the walker isn't done! - final RodLocusView rodLocusView = (RodLocusView)locusView; - final long nSkipped = rodLocusView.getLastSkippedBases(); - if ( nSkipped > 0 ) { - final GenomeLoc site = rodLocusView.getLocOneBeyondShard(); - final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); - final M x = walker.map(null, null, ac); - sum = walker.reduce(x, sum); - } - } - - return sum; - } - - /** - * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' - * of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype - * that comes along. - * @param walker walker to interrogate. - * @param dataProvider Data which which to drive the locus view. - * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. - */ - private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { - final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); - if( dataSource == DataSource.READS ) - return new CoveredLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) - return new AllLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) - return new RodLocusView(dataProvider); - else - throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); - } - - protected TraverseResults traverse(final LocusWalker walker, - final LocusView locusView, - final LocusReferenceView referenceView, - final ReferenceOrderedView referenceOrderedDataView, - final T sum) { - nanoScheduler.setDebug(DEBUG); - final TraverseLociMap myMap = new TraverseLociMap(walker); - final TraverseLociReduce myReduce = new TraverseLociReduce(walker); - - final MapDataIterator inputIterator = new MapDataIterator(locusView, referenceView, referenceOrderedDataView); - final T result = nanoScheduler.execute(inputIterator, myMap, sum, myReduce); - - return new TraverseResults(inputIterator.numIterations, result); - } - - /** - * Create iterator that provides inputs for all map calls into MapData, to be provided - * to NanoScheduler for Map/Reduce - */ - private class MapDataIterator implements Iterator { - final LocusView locusView; - final LocusReferenceView referenceView; - final ReferenceOrderedView referenceOrderedDataView; - int numIterations = 0; - - private MapDataIterator(LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView) { - this.locusView = locusView; - this.referenceView = referenceView; - this.referenceOrderedDataView = referenceOrderedDataView; - } - - @Override - public boolean hasNext() { - return locusView.hasNext() && ! engine.exceedsRuntimeLimit(); - } - - @Override - public MapData next() { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - //logger.info("Pulling data from MapDataIterator at " + location); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(location); - - numIterations++; - return new MapData(locus, refContext, tracker); - } - - @Override - public void remove() { - throw new UnsupportedOperationException("Cannot remove elements from MapDataIterator"); - } - } - - @Override - public void shutdown() { - nanoScheduler.shutdown(); - } - - /** - * The input data needed for each map call. The read, the reference, and the RODs - */ - private class MapData { - final AlignmentContext alignmentContext; - final ReferenceContext refContext; - final RefMetaDataTracker tracker; - - private MapData(final AlignmentContext alignmentContext, ReferenceContext refContext, RefMetaDataTracker tracker) { - this.alignmentContext = alignmentContext; - this.refContext = refContext; - this.tracker = tracker; - } - - @Override - public String toString() { - return "MapData " + alignmentContext.getLocation(); - } - } - - /** - * Contains the results of a map call, indicating whether the call was good, filtered, or done - */ - private class MapResult { - final M value; - final boolean reduceMe; - - /** - * Create a MapResult with value that should be reduced - * - * @param value the value to reduce - */ - private MapResult(final M value) { - this.value = value; - this.reduceMe = true; - } - - /** - * Create a MapResult that shouldn't be reduced - */ - private MapResult() { - this.value = null; - this.reduceMe = false; - } - } - - /** - * A static object that tells reduce that the result of map should be skipped (filtered or done) - */ - private final MapResult SKIP_REDUCE = new MapResult(); - - /** - * MapFunction for TraverseReads meeting NanoScheduler interface requirements - * - * Applies walker.map to MapData, returning a MapResult object containing the result - */ - private class TraverseLociMap implements NSMapFunction { - final LocusWalker walker; - - private TraverseLociMap(LocusWalker walker) { - this.walker = walker; - } - - @Override - public MapResult apply(final MapData data) { - if ( ! walker.isDone() ) { - final boolean keepMeP = walker.filter(data.tracker, data.refContext, data.alignmentContext); - if (keepMeP) { - final M x = walker.map(data.tracker, data.refContext, data.alignmentContext); - return new MapResult(x); - } - } - return SKIP_REDUCE; - } - } - - /** - * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements - * - * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable - */ - private class TraverseLociReduce implements NSReduceFunction { - final LocusWalker walker; - - private TraverseLociReduce(LocusWalker walker) { - this.walker = walker; - } - - @Override - public T apply(MapResult one, T sum) { - if ( one.reduceMe ) - // only run reduce on values that aren't DONE or FAILED - return walker.reduce(one.value, sum); - else - return sum; - } - } - - private class TraverseLociProgress implements NSProgressFunction { - @Override - public void progress(MapData lastProcessedMap) { - if (lastProcessedMap.alignmentContext != null) - printProgress(lastProcessedMap.alignmentContext.getLocation()); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsNano.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsNano.java deleted file mode 100644 index 2ce752b1f..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsNano.java +++ /dev/null @@ -1,256 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.traversals; - -import htsjdk.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.datasources.providers.ReadBasedReferenceOrderedView; -import org.broadinstitute.gatk.engine.datasources.providers.ReadReferenceView; -import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.gatk.engine.datasources.providers.ReadView; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.walkers.ReadWalker; -import org.broadinstitute.gatk.utils.nanoScheduler.NSMapFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NSProgressFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NSReduceFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NanoScheduler; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.Iterator; -import java.util.LinkedList; - -/** - * A nano-scheduling version of TraverseReads. - * - * Implements the traversal of a walker that accepts individual reads, the reference, and - * RODs per map call. Directly supports shared memory parallelism via NanoScheduler - * - * @author depristo - * @version 1.0 - * @date 9/2/2012 - */ -public class TraverseReadsNano extends TraversalEngine,ReadShardDataProvider> { - /** our log, which we want to capture anything from this class */ - private final static boolean PRE_READ_ALL_MAP_DATA = true; - protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); - private static final boolean DEBUG = false; - final NanoScheduler nanoScheduler; - - public TraverseReadsNano(int nThreads) { - nanoScheduler = new NanoScheduler(nThreads); - nanoScheduler.setProgressFunction(new NSProgressFunction() { - @Override - public void progress(MapData lastProcessedMap) { - if ( lastProcessedMap.refContext != null ) - // note, need to use getStopLocation so we don't give an interval to ProgressMeterDaemon - printProgress(lastProcessedMap.refContext.getLocus().getStopLocation()); - } - }); - } - - @Override - public String getTraversalUnits() { - return "reads"; - } - - /** - * Traverse by reads, given the data and the walker - * - * @param walker the walker to traverse with - * @param dataProvider the provider of the reads data - * @param sum the value of type T, specified by the walker, to feed to the walkers reduce function - * @return the reduce variable of the read walker - */ - public T traverse(ReadWalker walker, - ReadShardDataProvider dataProvider, - T sum) { - if ( logger.isDebugEnabled() ) - logger.debug(String.format("TraverseReadsNano.traverse Covered dataset is %s", dataProvider)); - - if( !dataProvider.hasReads() ) - throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); - - nanoScheduler.setDebug(DEBUG); - final TraverseReadsMap myMap = new TraverseReadsMap(walker); - final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker); - - final Iterator aggregatedInputs = aggregateMapData(dataProvider); - final T result = nanoScheduler.execute(aggregatedInputs, myMap, sum, myReduce); - - return result; - } - - /** - * Aggregate all of the inputs for all map calls into MapData, to be provided - * to NanoScheduler for Map/Reduce - * - * @param dataProvider the source of our data - * @return a linked list of MapData objects holding the read, ref, and ROD info for every map/reduce - * should execute - */ - private Iterator aggregateMapData(final ReadShardDataProvider dataProvider) { - final Iterator it = makeDataIterator(dataProvider); - if ( PRE_READ_ALL_MAP_DATA ) { - final LinkedList l = new LinkedList(); - while ( it.hasNext() ) l.add(it.next()); - return l.iterator(); - } else { - return it; - } - } - - - private Iterator makeDataIterator(final ReadShardDataProvider dataProvider) { - return new Iterator () { - final ReadView reads = new ReadView(dataProvider); - final ReadReferenceView reference = new ReadReferenceView(dataProvider); - final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); - final Iterator readIterator = reads.iterator(); - - @Override public boolean hasNext() { return ! engine.exceedsRuntimeLimit() && readIterator.hasNext(); } - - @Override - public MapData next() { - final SAMRecord read = readIterator.next(); - final ReferenceContext refContext = ! read.getReadUnmappedFlag() - ? reference.getReferenceContext(read) - : null; - - // if the read is mapped, create a metadata tracker - final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 - ? rodView.getReferenceOrderedDataForRead(read) - : null; - - // update the number of reads we've seen - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - return new MapData((GATKSAMRecord)read, refContext, tracker); - } - - @Override public void remove() { - throw new UnsupportedOperationException("Remove not supported"); - } - }; - } - - @Override - public void shutdown() { - nanoScheduler.shutdown(); - } - - /** - * The input data needed for each map call. The read, the reference, and the RODs - */ - private class MapData { - final GATKSAMRecord read; - final ReferenceContext refContext; - final RefMetaDataTracker tracker; - - private MapData(GATKSAMRecord read, ReferenceContext refContext, RefMetaDataTracker tracker) { - this.read = read; - this.refContext = refContext; - this.tracker = tracker; - } - } - - /** - * Contains the results of a map call, indicating whether the call was good, filtered, or done - */ - private class MapResult { - final M value; - final boolean reduceMe; - - /** - * Create a MapResult with value that should be reduced - * - * @param value the value to reduce - */ - private MapResult(final M value) { - this.value = value; - this.reduceMe = true; - } - - /** - * Create a MapResult that shouldn't be reduced - */ - private MapResult() { - this.value = null; - this.reduceMe = false; - } - } - - /** - * A static object that tells reduce that the result of map should be skipped (filtered or done) - */ - private final MapResult SKIP_REDUCE = new MapResult(); - - /** - * MapFunction for TraverseReads meeting NanoScheduler interface requirements - * - * Applies walker.map to MapData, returning a MapResult object containing the result - */ - private class TraverseReadsMap implements NSMapFunction { - final ReadWalker walker; - - private TraverseReadsMap(ReadWalker walker) { - this.walker = walker; - } - - @Override - public MapResult apply(final MapData data) { - if ( ! walker.isDone() ) { - final boolean keepMeP = walker.filter(data.refContext, data.read); - if (keepMeP) - return new MapResult(walker.map(data.refContext, data.read, data.tracker)); - } - - return SKIP_REDUCE; - } - } - - /** - * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements - * - * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable - */ - private class TraverseReadsReduce implements NSReduceFunction { - final ReadWalker walker; - - private TraverseReadsReduce(ReadWalker walker) { - this.walker = walker; - } - - @Override - public T apply(MapResult one, T sum) { - if ( one.reduceMe ) - // only run reduce on values that aren't DONE or FAILED - return walker.reduce(one.value, sum); - else - return sum; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionTraversalParameters.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionTraversalParameters.java deleted file mode 100644 index 7c428cd6a..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionTraversalParameters.java +++ /dev/null @@ -1,97 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers; - -import org.broadinstitute.gatk.utils.activeregion.BandPassActivityProfile; - -import java.lang.annotation.Documented; -import java.lang.annotation.Inherited; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; - -/** - * Describes the parameters that this walker requires of the active region traversal - * - * User: rpoplin - * Date: 1/18/12 - */ -@Documented -@Inherited -@Retention(RetentionPolicy.RUNTIME) - -public @interface ActiveRegionTraversalParameters { - /** - * How far to either side of the active region itself should we include reads? - * - * That is, if the active region is 10 bp wide, and extension is 5, ART will provide - * the walker with active regions 10 bp, with 5 bp of extension on either side, and - * all reads that cover the 20 bp of the region + extension. - * - * @return the size of the active region extension we'd like - */ - public int extension() default 0; - - /** - * The minimum number of bp for an active region, when we need to chop it up into pieces because - * it's become too big. This only comes into effect when there's literally no good place to chop - * that does make the region smaller than this value. - * - * @return the min size in bp of regions - */ - public int minRegion() default 50; - - /** - * The maximum size in bp of active regions wanted by this walker - * - * Active regions larger than this value are automatically cut up by ART into smaller - * regions of size <= this value. - * - * @return the max size in bp of regions - */ - public int maxRegion() default 1500; - - /** - * The variance value for the Gaussian kernel of the band pass filter employed by ART - * @return the breadth of the band pass gaussian kernel we want for our traversal - */ - public double bandPassSigma() default BandPassActivityProfile.DEFAULT_SIGMA; - - /** - * What is the maximum number of reads we're willing to hold in memory per sample - * during the traversal? This limits our exposure to unusually large amounts - * of coverage in the engine. - * @return the maximum number of reads we're willing to hold in memory - */ - public int maxReadsToHoldInMemoryPerSample() default 3000; - - /** - * No matter what the per sample value says, we will never hold more than this - * number of reads in memory at any time. Provides an upper bound on the total number - * of reads in the case where we have a lot of samples. - * @return the maximum number of reads to hold in memory - */ - public int maxReadsToHoldTotal() default 1000000; -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java deleted file mode 100644 index 9ff68bc9e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java +++ /dev/null @@ -1,196 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers; - -import com.google.java.contract.Ensures; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.filters.*; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.GenomeLocSortedSet; -import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; -import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; -import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; -import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; -import org.broadinstitute.gatk.utils.interval.IntervalSetRule; -import org.broadinstitute.gatk.utils.interval.IntervalUtils; - -import java.io.PrintStream; -import java.util.*; - -/** - * Base class for all the Active Region Walkers. - * User: rpoplin - * Date: 12/7/11 - */ - -@By(DataSource.READS) -@Requires({DataSource.READS, DataSource.REFERENCE}) -@PartitionBy(PartitionType.READ) -@ActiveRegionTraversalParameters(extension=50,maxRegion=1500) -@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class}) -@Downsample(by = DownsampleType.BY_SAMPLE, toCoverage = 1000) -@RemoveProgramRecords -public abstract class ActiveRegionWalker extends Walker { - /** - * If provided, this walker will write out its activity profile (per bp probabilities of being active) - * to this file in the IGV formatted TAB deliminated output: - * - * http://www.broadinstitute.org/software/igv/IGV - * - * Intended to make debugging the activity profile calculations easier - */ - @Output(fullName="activityProfileOut", shortName="APO", doc="Output the raw activity profile results in IGV format", required = false, defaultToStdout = false) - public PrintStream activityProfileOutStream = null; - - /** - * If provided, this walker will write out its active and inactive regions - * to this file in the IGV formatted TAB deliminated output: - * - * http://www.broadinstitute.org/software/igv/IGV - * - * Intended to make debugging the active region calculations easier - */ - @Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this IGV formatted file", required = false, defaultToStdout = false) - public PrintStream activeRegionOutStream = null; - - @Advanced - @Input(fullName="activeRegionIn", shortName="AR", doc="Use this interval list file as the active regions to process", required = false) - protected List> activeRegionBindings = null; - - @Advanced - @Argument(fullName="activeRegionExtension", shortName="activeRegionExtension", doc="The active region extension; if not provided defaults to Walker annotated default", required = false) - public Integer activeRegionExtension = null; - - /** - * For the active region walker to treat all bases as active. Useful for debugging when you want to force something like - * the HaplotypeCaller to process a specific interval you provide the GATK - */ - @Advanced - @Argument(fullName="forceActive", shortName="forceActive", doc="If provided, all bases will be tagged as active", required = false) - public boolean forceActive = false; - - @Advanced - @Argument(fullName="activeRegionMaxSize", shortName="activeRegionMaxSize", doc="The active region maximum size; if not provided defaults to Walker annotated default", required = false) - public Integer activeRegionMaxSize = null; - - @Advanced - @Argument(fullName="bandPassSigma", shortName="bandPassSigma", doc="The sigma of the band pass filter Gaussian kernel; if not provided defaults to Walker annotated default", required = false) - public Double bandPassSigma = null; - - /* - * For active region limits in ActivityProfile -* */ - @Hidden - @Argument(fullName = "maxProbPropagationDistance", shortName = "maxProbPropDist", minValue = 0, doc="Region probability propagation distance beyond it's maximum size.", required = false) - public Integer maxProbPropagationDistance = 50; - - @Advanced - @Argument(fullName = "activeProbabilityThreshold", shortName = "ActProbThresh", minValue = 0.0, maxValue = 1.0, doc="Threshold for the probability of a profile state being active.", required = false) - public Double activeProbThreshold = 0.002; - - private GenomeLocSortedSet presetActiveRegions = null; - - @Override - public void initialize() { - if( activeRegionBindings == null ) { return; } - List allIntervals = new ArrayList(0); - for ( IntervalBinding intervalBinding : activeRegionBindings ) { - List intervals = intervalBinding.getIntervals(this.getToolkit()); - - if ( intervals.isEmpty() ) { - logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed."); - } - - allIntervals = IntervalUtils.mergeListsBySetOperator(intervals, allIntervals, IntervalSetRule.UNION); - } - - presetActiveRegions = IntervalUtils.sortAndMergeIntervals(this.getToolkit().getGenomeLocParser(), allIntervals, IntervalMergingRule.ALL); - } - - /** - * Does this walker want us to use a set of preset action regions instead of dynamically using the result of isActive? - * @return true if yes, false if no - */ - public boolean hasPresetActiveRegions() { - return presetActiveRegions != null; - } - - /** - * Get the set of preset active regions, or null if none were provided - * @return a set of genome locs specifying fixed active regions requested by the walker, or null if none exist - */ - public GenomeLocSortedSet getPresetActiveRegions() { - return presetActiveRegions; - } - - // Do we actually want to operate on the context? - public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { - return true; // We are keeping all the reads - } - - public EnumSet desiredReadStates() { - return EnumSet.of(ActiveRegionReadState.PRIMARY); - } - - public final boolean wantsNonPrimaryReads() { - return desiredReadStates().contains(ActiveRegionReadState.NONPRIMARY); - } - - public boolean wantsExtendedReads() { - return desiredReadStates().contains(ActiveRegionReadState.EXTENDED); - } - - public boolean wantsUnmappedReads() { - return desiredReadStates().contains(ActiveRegionReadState.UNMAPPED); - } - - // Determine probability of active status over the AlignmentContext - @Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"}) - public abstract ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context); - - // Map over the ActiveRegion - public abstract MapType map(final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker); - - public final GenomeLocSortedSet extendIntervals( final GenomeLocSortedSet intervals, final GenomeLocParser genomeLocParser, IndexedFastaSequenceFile reference ) { - final int activeRegionExtension = this.getClass().getAnnotation(ActiveRegionTraversalParameters.class).extension(); - final List allIntervals = new ArrayList(); - for( final GenomeLoc interval : intervals.toList() ) { - final int start = Math.max( 1, interval.getStart() - activeRegionExtension ); - final int stop = Math.min( reference.getSequenceDictionary().getSequence(interval.getContig()).getSequenceLength(), interval.getStop() + activeRegionExtension ); - allIntervals.add( genomeLocParser.createGenomeLoc(interval.getContig(), start, stop) ); - } - return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, IntervalMergingRule.ALL); - } - - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Downsample.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Downsample.java deleted file mode 100644 index c112d7d26..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Downsample.java +++ /dev/null @@ -1,47 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers; - -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; - -import java.lang.annotation.*; - -/** - * Specifies a method for downsampling the reads passed to a given - * walker based on the input from that walker. - * - * @author hanna - * @version 0.1 - */ -@Documented -@Inherited -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.TYPE) -public @interface Downsample { - DownsampleType by(); - int toCoverage() default -1; - double toFraction() default -1.0F; -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/DuplicateWalker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/DuplicateWalker.java deleted file mode 100644 index 96d2d5dad..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/DuplicateWalker.java +++ /dev/null @@ -1,57 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers; - -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.filters.NotPrimaryAlignmentFilter; -import org.broadinstitute.gatk.engine.filters.UnmappedReadFilter; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.List; -import java.util.Set; - -/** - * Created by IntelliJ IDEA. - * User: mdepristo - * Date: Feb 22, 2009 - * Time: 2:52:28 PM - * To change this template use File | Settings | File Templates. - */ -@Requires({DataSource.READS,DataSource.REFERENCE}) -@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class}) -public abstract class DuplicateWalker extends Walker { - // Do we actually want to operate on the context? - public boolean filter(GenomeLoc loc, AlignmentContext context, Set> readSets ) { - return true; // We are keeping all the reads - } - - public abstract MapType map(GenomeLoc loc, AlignmentContext context, Set> readSets ); - - // Given result of map function - public abstract ReduceType reduceInit(); - public abstract ReduceType reduce(MapType value, ReduceType sum); -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/LocusWalker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/LocusWalker.java deleted file mode 100644 index 1e7b0e54c..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/LocusWalker.java +++ /dev/null @@ -1,58 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers; - -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.filters.DuplicateReadFilter; -import org.broadinstitute.gatk.engine.filters.FailsVendorQualityCheckFilter; -import org.broadinstitute.gatk.engine.filters.NotPrimaryAlignmentFilter; -import org.broadinstitute.gatk.engine.filters.UnmappedReadFilter; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; - -/** - * Created by IntelliJ IDEA. - * User: mdepristo - * Date: Feb 22, 2009 - * Time: 2:52:28 PM - * To change this template use File | Settings | File Templates. - */ -@By(DataSource.READS) -@Requires({DataSource.READS,DataSource.REFERENCE}) -@PartitionBy(PartitionType.LOCUS) -@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class}) -@Downsample(by = DownsampleType.BY_SAMPLE, toCoverage = 1000) -@RemoveProgramRecords -public abstract class LocusWalker extends Walker { - // Do we actually want to operate on the context? - public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return true; // We are keeping all the reads - } - - // Map over the org.broadinstitute.gatk.engine.contexts.AlignmentContext - public abstract MapType map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Multiplex.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Multiplex.java deleted file mode 100644 index e771d1ed8..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Multiplex.java +++ /dev/null @@ -1,44 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers; - -import java.lang.annotation.*; - -/** - * Indicates that the class should be multiplexed according to the rules - * specified in the multiplexer. - * - * @author mhanna - * @version 0.1 - */ -@Documented -@Inherited -@Retention(RetentionPolicy.RUNTIME) -@Target({ElementType.FIELD}) -public @interface Multiplex { - public Class value(); - public String[] arguments() default {}; -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Multiplexer.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Multiplexer.java deleted file mode 100644 index 969e288a5..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Multiplexer.java +++ /dev/null @@ -1,52 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers; - -import java.util.Collection; - -/** - * An interface for multiplexing output streams. - * - * @author mhanna - * @version 0.1 - */ -public interface Multiplexer { - /** - * Generate a list of the potential outputs that can be created as a function of the other - * command-line arguments in this class. - * @return A collection of unique identifiers for the file multiplex. - */ - public Collection multiplex(); - - /** - * Transform the given command-line argument into a suitable form specific to this filename. - * @param multiplexedEntry Identifies the individual component of the multiplex. Will be a value in the collection - * passed back by multiplex(). - * @param argument The actual command-line argument, supplied for transformation. - * @return A transformed representation of the command-line argument. - */ - public String transformArgument(final T multiplexedEntry, final String argument); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionBy.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionBy.java deleted file mode 100644 index 346f7c40e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionBy.java +++ /dev/null @@ -1,39 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers; - -import java.lang.annotation.*; - -/** - * Allows the walker to indicate how to partition data it wants to consume. - */ -@Documented -@Inherited -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.TYPE) -public @interface PartitionBy { - PartitionType value(); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadWalker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadWalker.java deleted file mode 100644 index 9528cf18e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadWalker.java +++ /dev/null @@ -1,55 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers; - -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -/** - * Created by IntelliJ IDEA. - * User: mdepristo - * Date: Feb 22, 2009 - * Time: 2:52:28 PM - * To change this template use File | Settings | File Templates. - */ -@Requires({DataSource.READS, DataSource.REFERENCE}) -@PartitionBy(PartitionType.READ) -public abstract class ReadWalker extends Walker { - public boolean requiresOrderedReads() { return false; } - - // Do we actually want to operate on the context? - /** Must return true for reads that need to be processed. Reads, for which this method return false will - * be skipped by the engine and never passed to the walker. - */ - public boolean filter(ReferenceContext ref, GATKSAMRecord read) { - // We are keeping all the reads - return true; - } - - // Map over the org.broadinstitute.gatk.engine.contexts.AlignmentContext - public abstract MapType map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Walker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Walker.java deleted file mode 100644 index 31472fdfd..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Walker.java +++ /dev/null @@ -1,177 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers; - -import htsjdk.samtools.SAMSequenceDictionary; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.filters.MalformedReadFilter; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.samples.Sample; -import org.broadinstitute.gatk.engine.samples.SampleDB; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.baq.BAQ; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; -import org.broadinstitute.gatk.utils.recalibration.BQSRMode; - -import java.util.List; - -/** - * Created by IntelliJ IDEA. - * User: hanna - * Date: Mar 17, 2009 - * Time: 1:53:31 PM - * To change this template use File | Settings | File Templates. - */ -@ReadFilters(MalformedReadFilter.class) -@PartitionBy(PartitionType.NONE) -@Downsample(by = DownsampleType.NONE) -@BAQMode(QualityMode = BAQ.QualityMode.OVERWRITE_QUALS, ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT) -@BQSRMode(ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT) -@DocumentedGATKFeature(groupName = "Uncategorized", extraDocs = {CommandLineGATK.class}) -public abstract class Walker { - final protected static Logger logger = Logger.getLogger(Walker.class); - private GenomeAnalysisEngine toolkit; - - protected Walker() { - } - - /** - * Set the toolkit, for peering into internal structures that can't - * otherwise be read. - * @param toolkit The genome analysis toolkit. - */ - public void setToolkit(GenomeAnalysisEngine toolkit) { - this.toolkit = toolkit; - } - - /** - * Retrieve the toolkit, for peering into internal structures that can't - * otherwise be read. Use sparingly, and discuss uses with software engineering - * team. - * @return The genome analysis toolkit. - */ - protected GenomeAnalysisEngine getToolkit() { - return toolkit; - } - - /** - * Gets the master sequence dictionary for this walker - * @link GenomeAnalysisEngine.getMasterSequenceDictionary - * @return - */ - protected SAMSequenceDictionary getMasterSequenceDictionary() { - return getToolkit().getMasterSequenceDictionary(); - } - - public SampleDB getSampleDB() { - return getToolkit().getSampleDB(); - } - - protected Sample getSample(final String id) { - return getToolkit().getSampleDB().getSample(id); - } - - /** - * (conceptual static) method that states whether you want to see reads piling up at a locus - * that contain a deletion at the locus. - * - * ref: ATCTGA - * read1: ATCTGA - * read2: AT--GA - * - * Normally, the locus iterator only returns a list of read1 at this locus at position 3, but - * if this function returns true, then the system will return (read1, read2) with offsets - * of (3, -1). The -1 offset indicates a deletion in the read. - * - * @return false if you don't want to see deletions, or true if you do - */ - public boolean includeReadsWithDeletionAtLoci() { - return false; - } - - public void initialize() { } - - /** - * A function for overloading in subclasses providing a mechanism to abort early from a walker. - * - * If this ever returns true, then the Traversal engine will stop executing map calls - * and start the process of shutting down the walker in an orderly fashion. - * @return - */ - public boolean isDone() { - return false; - } - - /** - * Provide an initial value for reduce computations. - * @return Initial value of reduce. - */ - public abstract ReduceType reduceInit(); - - /** - * Reduces a single map with the accumulator provided as the ReduceType. - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return accumulator with result of the map taken into account. - */ - public abstract ReduceType reduce(MapType value, ReduceType sum); - - public void onTraversalDone(ReduceType result) { - logger.info("[REDUCE RESULT] Traversal result is: " + result); - } - - /** - * General interval reduce routine called after all of the traversals are done - * @param results interval reduce results - */ - public void onTraversalDone(List> results) { - for ( Pair result : results ) { - logger.info(String.format("[INTERVAL REDUCE RESULT] at %s ", result.getFirst())); - this.onTraversalDone(result.getSecond()); - } - } - - /** - * Return true if your walker wants to reduce each interval separately. Default is false. - * - * If you set this flag, several things will happen. - * - * The system will invoke reduceInit() once for each interval being processed, starting a fresh reduce - * Reduce will accumulate normally at each map unit in the interval - * However, onTraversalDone(reduce) will be called after each interval is processed. - * The system will call onTraversalDone( GenomeLoc -> reduce ), after all reductions are done, - * which is overloaded here to call onTraversalDone(reduce) for each location - * - * @return true if your walker wants to reduce each interval separately. - */ - public boolean isReduceByInterval() { - return false; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/BAMDiffableReader.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/BAMDiffableReader.java deleted file mode 100644 index 2c8cc7ae1..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/BAMDiffableReader.java +++ /dev/null @@ -1,119 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import htsjdk.samtools.SAMFileReader; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SAMRecordIterator; -import htsjdk.samtools.ValidationStringency; -import htsjdk.samtools.util.BlockCompressedInputStream; - -import java.io.*; -import java.util.Arrays; - - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 7/4/11 - * Time: 1:09 PM - * - * Class implementing diffnode reader for VCF - */ -public class BAMDiffableReader implements DiffableReader { - @Override - public String getName() { return "BAM"; } - - @Override - public DiffElement readFromFile(File file, int maxElementsToRead) { - final SAMFileReader reader = new SAMFileReader(file, null); // null because we don't want it to look for the index - reader.setValidationStringency(ValidationStringency.SILENT); - - DiffNode root = DiffNode.rooted(file.getName()); - SAMRecordIterator iterator = reader.iterator(); - - int count = 0; - while ( iterator.hasNext() ) { - final SAMRecord record = iterator.next(); - - // name is the read name + first of pair - String name = record.getReadName().replace('.', '_'); - if ( record.getReadPairedFlag() ) { - name += record.getFirstOfPairFlag() ? "_1" : "_2"; - } - - DiffNode readRoot = DiffNode.empty(name, root); - - // add fields - readRoot.add("NAME", record.getReadName()); - readRoot.add("FLAGS", record.getFlags()); - readRoot.add("RNAME", record.getReferenceName()); - readRoot.add("POS", record.getAlignmentStart()); - readRoot.add("MAPQ", record.getMappingQuality()); - readRoot.add("CIGAR", record.getCigarString()); - readRoot.add("RNEXT", record.getMateReferenceName()); - readRoot.add("PNEXT", record.getMateAlignmentStart()); - readRoot.add("TLEN", record.getInferredInsertSize()); - readRoot.add("SEQ", record.getReadString()); - readRoot.add("QUAL", record.getBaseQualityString()); - - for ( SAMRecord.SAMTagAndValue xt : record.getAttributes() ) { - readRoot.add(xt.tag, xt.value); - } - - // add record to root - if ( ! root.hasElement(name) ) - // protect ourselves from malformed files - root.add(readRoot); - count += readRoot.size(); - if ( count > maxElementsToRead && maxElementsToRead != -1) - break; - } - - reader.close(); - - return root.getBinding(); - } - - @Override - public boolean canRead(File file) { - final byte[] BAM_MAGIC = "BAM\1".getBytes(); - final byte[] buffer = new byte[BAM_MAGIC.length]; - try { - InputStream fstream = new BufferedInputStream(new FileInputStream(file)); - if ( !BlockCompressedInputStream.isValidFile(fstream) ) - return false; - final BlockCompressedInputStream BCIS = new BlockCompressedInputStream(fstream); - BCIS.read(buffer, 0, BAM_MAGIC.length); - BCIS.close(); - return Arrays.equals(buffer, BAM_MAGIC); - } catch ( IOException e ) { - return false; - } catch ( htsjdk.samtools.FileTruncatedException e ) { - return false; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffElement.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffElement.java deleted file mode 100644 index ebed91470..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffElement.java +++ /dev/null @@ -1,125 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import com.google.java.contract.Requires; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 7/4/11 - * Time: 12:55 PM - * - * An interface that must be implemented to allow us to calculate differences - * between structured objects - */ -@Invariant({ - "name != null", - "value != null", - "parent != null || name.equals(\"ROOT\")", - "value == null || value.getBinding() == this"}) -public class DiffElement { - public final static DiffElement ROOT = new DiffElement(); - - final private String name; - final private DiffElement parent; - final private DiffValue value; - - /** - * For ROOT only - */ - private DiffElement() { - this.name = "ROOT"; - this.parent = null; - this.value = new DiffValue(this, "ROOT"); - } - - @Requires({"name != null", "parent != null", "value != null"}) - public DiffElement(String name, DiffElement parent, DiffValue value) { - if ( name.equals("ROOT") ) throw new IllegalArgumentException("Cannot use reserved name ROOT"); - this.name = name; - this.parent = parent; - this.value = value; - this.value.setBinding(this); - } - - @Ensures({"result != null"}) - public String getName() { - return name; - } - - public DiffElement getParent() { - return parent; - } - - @Ensures({"result != null"}) - public DiffValue getValue() { - return value; - } - - public boolean isRoot() { return this == ROOT; } - - @Ensures({"result != null"}) - @Override - public String toString() { - return getName() + "=" + getValue().toString(); - } - - public String toString(int offset) { - return (offset > 0 ? Utils.dupString(' ', offset) : 0) + getName() + "=" + getValue().toString(offset); - } - - @Ensures({"result != null"}) - public final String fullyQualifiedName() { - if ( isRoot() ) - return ""; - else if ( parent.isRoot() ) - return name; - else - return parent.fullyQualifiedName() + "." + name; - } - - @Ensures({"result != null"}) - public String toOneLineString() { - return getName() + "=" + getValue().toOneLineString(); - } - - @Ensures({"result != null"}) - public DiffNode getValueAsNode() { - if ( getValue().isCompound() ) - return (DiffNode)getValue(); - else - throw new ReviewedGATKException("Illegal request conversion of a DiffValue into a DiffNode: " + this); - } - - public int size() { - return 1 + getValue().size(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffEngine.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffEngine.java deleted file mode 100644 index d10cfea8a..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffEngine.java +++ /dev/null @@ -1,437 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportTable; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.classloader.PluginManager; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.File; -import java.io.PrintStream; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 7/4/11 - * Time: 12:51 PM - * A generic engine for comparing tree-structured objects - * - */ -public class DiffEngine { - final protected static Logger logger = Logger.getLogger(DiffEngine.class); - - private final Map readers = new HashMap(); - - public DiffEngine() { - loadDiffableReaders(); - } - - // -------------------------------------------------------------------------------- - // - // difference calculation - // - // -------------------------------------------------------------------------------- - - public List diff(DiffElement master, DiffElement test) { - DiffValue masterValue = master.getValue(); - DiffValue testValue = test.getValue(); - - if ( masterValue.isCompound() && masterValue.isCompound() ) { - return diff(master.getValueAsNode(), test.getValueAsNode()); - } else if ( masterValue.isAtomic() && testValue.isAtomic() ) { - return diff(masterValue, testValue); - } else { - // structural difference in types. one is node, other is leaf - return Arrays.asList(new Difference(master, test)); - } - } - - public List diff(DiffNode master, DiffNode test) { - Set allNames = new HashSet(master.getElementNames()); - allNames.addAll(test.getElementNames()); - List diffs = new ArrayList(); - - for ( String name : allNames ) { - DiffElement masterElt = master.getElement(name); - DiffElement testElt = test.getElement(name); - if ( masterElt == null && testElt == null ) { - throw new ReviewedGATKException("BUG: unexpectedly got two null elements for field: " + name); - } else if ( masterElt == null || testElt == null ) { // if either is null, we are missing a value - // todo -- should one of these be a special MISSING item? - diffs.add(new Difference(masterElt, testElt)); - } else { - diffs.addAll(diff(masterElt, testElt)); - } - } - - return diffs; - } - - public List diff(DiffValue master, DiffValue test) { - if ( master.getValue().equals(test.getValue()) ) { - return Collections.emptyList(); - } else { - return Arrays.asList(new Difference(master.getBinding(), test.getBinding())); - } - } - - // -------------------------------------------------------------------------------- - // - // Summarizing differences - // - // -------------------------------------------------------------------------------- - - /** - * Emits a summary of the diffs to out. Suppose you have the following three differences: - * - * A.X.Z:1!=2 - * A.Y.Z:3!=4 - * B.X.Z:5!=6 - * - * The above is the itemized list of the differences. The summary looks for common differences - * in the name hierarchy, counts those shared elements, and emits the differences that occur - * in order of decreasing counts. - * - * So, in the above example, what are the shared elements? - * - * A.X.Z and B.X.Z share X.Z, so there's a *.X.Z with count 2 - * A.X.Z, A.Y.Z, and B.X.Z all share *.*.Z, with count 3 - * Each of A.X.Z, A.Y.Z, and B.X.Z are individually unique, with count 1 - * - * So we would emit the following summary: - * - * *.*.Z: 3 - * *.X.Z: 2 - * A.X.Z: 1 [specific difference: 1!=2] - * A.Y.Z: 1 [specific difference: 3!=4] - * B.X.Z: 1 [specific difference: 5!=6] - * - * The algorithm to accomplish this calculation is relatively simple. Start with all of the - * concrete differences. For each pair of differences A1.A2....AN and B1.B2....BN: - * - * find the longest common subsequence Si.Si+1...SN where Ai = Bi = Si - * If i == 0, then there's no shared substructure - * If i > 0, then generate the summarized value X = *.*...Si.Si+1...SN - * if X is a known summary, increment it's count, otherwise set its count to 1 - * - * Not that only pairs of the same length are considered as potentially equivalent - * - * @param params determines how we display the items - * @param diffs the list of differences to summarize - */ - public void reportSummarizedDifferences(List diffs, SummaryReportParams params ) { - printSummaryReport(summarizedDifferencesOfPaths(diffs, params.doPairwise, params.maxRawDiffsToSummarize), params ); - } - - final protected static String[] diffNameToPath(String diffName) { - return diffName.split("\\."); - } - - protected List summarizedDifferencesOfPathsFromString(List singletonDiffs) { - List diffs = new ArrayList(); - - for ( String diff : singletonDiffs ) { - diffs.add(new Difference(diff)); - } - - return summarizedDifferencesOfPaths(diffs, true, -1); - } - - /** - * Computes a minimum set of potential differences between all singleton differences - * in singletonDiffs. Employs an expensive pairwise O(n^2) algorithm. - * - * @param singletonDiffs - * @param maxRawDiffsToSummarize - * @return - */ - private Map initialPairwiseSummaries(final List singletonDiffs, - final int maxRawDiffsToSummarize) { - Map summaries = new HashMap(); - - // create the initial set of differences - for ( int i = 0; i < singletonDiffs.size(); i++ ) { - for ( int j = 0; j <= i; j++ ) { - Difference diffPath1 = singletonDiffs.get(i); - Difference diffPath2 = singletonDiffs.get(j); - if ( diffPath1.length() == diffPath2.length() ) { - int lcp = longestCommonPostfix(diffPath1.getParts(), diffPath2.getParts()); - String path = diffPath2.getPath(); - if ( lcp != 0 && lcp != diffPath1.length() ) - path = summarizedPath(diffPath2.getParts(), lcp); - Difference sumDiff = new Difference(path, diffPath2.getMaster(), diffPath2.getTest()); - sumDiff.setCount(0); - addSummaryIfMissing(summaries, sumDiff); - - if ( maxRawDiffsToSummarize != -1 && summaries.size() > maxRawDiffsToSummarize) - return summaries; - } - } - } - - return summaries; - } - - /** - * Computes the possible leaf differences among the singleton diffs. - * - * The leaf differences are all of the form *.*...*.X where all internal - * differences are wildcards and the only summarized difference considered - * interesting to compute is - * - * @param singletonDiffs - * @param maxRawDiffsToSummarize - * @return - */ - private Map initialLeafSummaries(final List singletonDiffs, - final int maxRawDiffsToSummarize) { - Map summaries = new HashMap(); - - // create the initial set of differences - for ( final Difference d : singletonDiffs ) { - final String path = summarizedPath(d.getParts(), 1); - Difference sumDiff = new Difference(path, d.getMaster(), d.getTest()); - sumDiff.setCount(0); - addSummaryIfMissing(summaries, sumDiff); - - if ( maxRawDiffsToSummarize != -1 && summaries.size() > maxRawDiffsToSummarize) - return summaries; - } - - return summaries; - } - - protected List summarizedDifferencesOfPaths(final List singletonDiffs, - final boolean doPairwise, - final int maxRawDiffsToSummarize) { - final Map summaries = doPairwise - ? initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize) - : initialLeafSummaries(singletonDiffs, maxRawDiffsToSummarize); - - // count differences - for ( Difference diffPath : singletonDiffs ) { - for ( Difference sumDiff : summaries.values() ) { - if ( sumDiff.matches(diffPath.getParts()) ) - sumDiff.incCount(); - } - } - - List sortedSummaries = new ArrayList(summaries.values()); - Collections.sort(sortedSummaries); - return sortedSummaries; - } - - protected void addSummaryIfMissing(Map summaries, Difference diff) { - if ( ! summaries.containsKey(diff.getPath()) ) { - summaries.put(diff.getPath(), diff); - } - } - - protected void printSummaryReport(List sortedSummaries, SummaryReportParams params ) { - List toShow = new ArrayList(); - int count = 0, count1 = 0; - for ( Difference diff : sortedSummaries ) { - if ( diff.getCount() < params.minSumDiffToShow ) - // in order, so break as soon as the count is too low - break; - - if ( params.maxItemsToDisplay != 0 && count++ > params.maxItemsToDisplay ) - break; - - if ( diff.getCount() == 1 ) { - count1++; - if ( params.maxCountOneItems != 0 && count1 > params.maxCountOneItems ) - break; - } - - toShow.add(diff); - } - - // if we want it in descending order, reverse the list - if ( ! params.descending ) { - Collections.reverse(toShow); - } - - // now that we have a specific list of values we want to show, display them - GATKReport report = new GATKReport(); - final String tableName = "differences"; - report.addTable(tableName, "Summarized differences between the master and test files. See http://www.broadinstitute.org/gatk/guide/article?id=1299 for more information", 3); - final GATKReportTable table = report.getTable(tableName); - table.addColumn("Difference"); - table.addColumn("NumberOfOccurrences"); - table.addColumn("ExampleDifference"); - for ( final Difference diff : toShow ) { - final String key = diff.getPath(); - table.addRowID(key, true); - table.set(key, "NumberOfOccurrences", diff.getCount()); - table.set(key, "ExampleDifference", diff.valueDiffString()); - } - GATKReport output = new GATKReport(table); - output.print(params.out); - } - - protected static int longestCommonPostfix(String[] diffPath1, String[] diffPath2) { - int i = 0; - for ( ; i < diffPath1.length; i++ ) { - int j = diffPath1.length - i - 1; - if ( ! diffPath1[j].equals(diffPath2[j]) ) - break; - } - return i; - } - - /** - * parts is [A B C D] - * commonPostfixLength: how many parts are shared at the end, suppose its 2 - * We want to create a string *.*.C.D - * - * @param parts the separated path values [above without .] - * @param commonPostfixLength - * @return - */ - protected static String summarizedPath(String[] parts, int commonPostfixLength) { - int stop = parts.length - commonPostfixLength; - if ( stop > 0 ) parts = parts.clone(); - for ( int i = 0; i < stop; i++ ) { - parts[i] = "*"; - } - return Utils.join(".", parts); - } - - // -------------------------------------------------------------------------------- - // - // plugin manager - // - // -------------------------------------------------------------------------------- - - public void loadDiffableReaders() { - List> drClasses = new PluginManager( DiffableReader.class ).getPlugins(); - - logger.info("Loading diffable modules:"); - for (Class drClass : drClasses ) { - logger.info("\t" + drClass.getSimpleName()); - - try { - DiffableReader dr = drClass.newInstance(); - readers.put(dr.getName(), dr); - } catch (InstantiationException e) { - throw new ReviewedGATKException("Unable to instantiate module '" + drClass.getSimpleName() + "'"); - } catch (IllegalAccessException e) { - throw new ReviewedGATKException("Illegal access error when trying to instantiate '" + drClass.getSimpleName() + "'"); - } - } - } - - protected Map getReaders() { - return readers; - } - - protected DiffableReader getReader(String name) { - return readers.get(name); - } - - /** - * Returns a reader appropriate for this file, or null if no such reader exists - * @param file - * @return - */ - public DiffableReader findReaderForFile(File file) { - for ( DiffableReader reader : readers.values() ) - if (reader.canRead(file) ) - return reader; - - return null; - } - - /** - * Returns true if reader appropriate for this file, or false if no such reader exists - * @param file - * @return - */ - public boolean canRead(File file) { - return findReaderForFile(file) != null; - } - - - public DiffElement createDiffableFromFile(File file) { - return createDiffableFromFile(file, -1); - } - - public DiffElement createDiffableFromFile(File file, int maxElementsToRead) { - DiffableReader reader = findReaderForFile(file); - if ( reader == null ) - throw new UserException("Unsupported file type: " + file); - else - return reader.readFromFile(file, maxElementsToRead); - } - - public static boolean simpleDiffFiles(File masterFile, File testFile, int maxElementsToRead, DiffEngine.SummaryReportParams params) { - DiffEngine diffEngine = new DiffEngine(); - - if ( diffEngine.canRead(masterFile) && diffEngine.canRead(testFile) ) { - DiffElement master = diffEngine.createDiffableFromFile(masterFile, maxElementsToRead); - DiffElement test = diffEngine.createDiffableFromFile(testFile, maxElementsToRead); - List diffs = diffEngine.diff(master, test); - diffEngine.reportSummarizedDifferences(diffs, params); - return true; - } else { - return false; - } - } - - public static class SummaryReportParams { - final PrintStream out; - final int maxItemsToDisplay; - final int maxCountOneItems; - final int minSumDiffToShow; - final int maxRawDiffsToSummarize; - final boolean doPairwise; - boolean descending = true; - - public SummaryReportParams(PrintStream out, - int maxItemsToDisplay, - int maxCountOneItems, - int minSumDiffToShow, - int maxRawDiffsToSummarize, - final boolean doPairwise) { - this.out = out; - this.maxItemsToDisplay = maxItemsToDisplay; - this.maxCountOneItems = maxCountOneItems; - this.minSumDiffToShow = minSumDiffToShow; - this.maxRawDiffsToSummarize = maxRawDiffsToSummarize; - this.doPairwise = doPairwise; - } - - public void setDescending(boolean descending) { - this.descending = descending; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffNode.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffNode.java deleted file mode 100644 index dde9ca50d..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffNode.java +++ /dev/null @@ -1,249 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import com.google.java.contract.Requires; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 7/4/11 - * Time: 12:55 PM - * - * An interface that must be implemented to allow us to calculate differences - * between structured objects - */ -public class DiffNode extends DiffValue { - private Map getElementMap() { - return (Map)super.getValue(); - } - private static Map emptyElements() { return new HashMap(); } - - private DiffNode(Map elements) { - super(elements); - } - - private DiffNode(DiffElement binding, Map elements) { - super(binding, elements); - } - - // --------------------------------------------------------------------------- - // - // constructors - // - // --------------------------------------------------------------------------- - - public static DiffNode rooted(String name) { - return empty(name, DiffElement.ROOT); - } - - public static DiffNode empty(String name, DiffElement parent) { - DiffNode df = new DiffNode(emptyElements()); - DiffElement elt = new DiffElement(name, parent, df); - df.setBinding(elt); - return df; - } - - public static DiffNode empty(String name, DiffValue parent) { - return empty(name, parent.getBinding()); - } - - // --------------------------------------------------------------------------- - // - // accessors - // - // --------------------------------------------------------------------------- - - @Override - public boolean isAtomic() { return false; } - - public Collection getElementNames() { - return getElementMap().keySet(); - } - - public Collection getElements() { - return getElementMap().values(); - } - - private Collection getElements(boolean atomicOnly) { - List elts = new ArrayList(); - for ( DiffElement elt : getElements() ) - if ( (atomicOnly && elt.getValue().isAtomic()) || (! atomicOnly && elt.getValue().isCompound())) - elts.add(elt); - return elts; - } - - public Collection getAtomicElements() { - return getElements(true); - } - - public Collection getCompoundElements() { - return getElements(false); - } - - /** - * Returns the element bound to name, or null if no such binding exists - * @param name - * @return - */ - public DiffElement getElement(String name) { - return getElementMap().get(name); - } - - /** - * Returns true if name is bound in this node - * @param name - * @return - */ - public boolean hasElement(String name) { - return getElement(name) != null; - } - - // --------------------------------------------------------------------------- - // - // add - // - // --------------------------------------------------------------------------- - - @Requires("elt != null") - public void add(DiffElement elt) { - if ( getElementMap().containsKey(elt.getName()) ) - throw new IllegalArgumentException("Attempting to rebind already existing binding: " + elt + " node=" + this); - getElementMap().put(elt.getName(), elt); - } - - @Requires("elt != null") - public void add(DiffValue elt) { - add(elt.getBinding()); - } - - @Requires("elts != null") - public void add(Collection elts) { - for ( DiffElement e : elts ) - add(e); - } - - public void add(String name, Object value) { - add(new DiffElement(name, this.getBinding(), new DiffValue(value))); - } - - public int size() { - int count = 0; - for ( DiffElement value : getElements() ) - count += value.size(); - return count; - } - - // --------------------------------------------------------------------------- - // - // toString - // - // --------------------------------------------------------------------------- - - @Override - public String toString() { - return toString(0); - } - - @Override - public String toString(int offset) { - String off = offset > 0 ? Utils.dupString(' ', offset) : ""; - StringBuilder b = new StringBuilder(); - - b.append("(").append("\n"); - Collection atomicElts = getAtomicElements(); - for ( DiffElement elt : atomicElts ) { - b.append(elt.toString(offset + 2)).append('\n'); - } - - for ( DiffElement elt : getCompoundElements() ) { - b.append(elt.toString(offset + 4)).append('\n'); - } - b.append(off).append(")").append("\n"); - - return b.toString(); - } - - @Override - public String toOneLineString() { - StringBuilder b = new StringBuilder(); - - b.append('('); - List parts = new ArrayList(); - for ( DiffElement elt : getElements() ) - parts.add(elt.toOneLineString()); - b.append(Utils.join(" ", parts)); - b.append(')'); - - return b.toString(); - } - - // -------------------------------------------------------------------------------- - // - // fromString and toOneLineString - // - // -------------------------------------------------------------------------------- - - public static DiffElement fromString(String tree) { - return fromString(tree, DiffElement.ROOT); - } - - /** - * Doesn't support full tree structure parsing - * @param tree - * @param parent - * @return - */ - private static DiffElement fromString(String tree, DiffElement parent) { - // X=(A=A B=B C=(D=D)) - String[] parts = tree.split("=", 2); - if ( parts.length != 2 ) - throw new ReviewedGATKException("Unexpected tree structure: " + tree); - String name = parts[0]; - String value = parts[1]; - - if ( value.length() == 0 ) - throw new ReviewedGATKException("Illegal tree structure: " + value + " at " + tree); - - if ( value.charAt(0) == '(' ) { - if ( ! value.endsWith(")") ) - throw new ReviewedGATKException("Illegal tree structure. Missing ): " + value + " at " + tree); - String subtree = value.substring(1, value.length()-1); - DiffNode rec = DiffNode.empty(name, parent); - String[] subParts = subtree.split(" "); - for ( String subPart : subParts ) { - rec.add(fromString(subPart, rec.getBinding())); - } - return rec.getBinding(); - } else { - return new DiffValue(name, parent, value).getBinding(); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffObjects.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffObjects.java deleted file mode 100644 index c622e24f1..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffObjects.java +++ /dev/null @@ -1,276 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import org.broadinstitute.gatk.utils.commandline.Argument; -import org.broadinstitute.gatk.utils.commandline.Input; -import org.broadinstitute.gatk.utils.commandline.Output; -import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.walkers.RodWalker; -import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; -import org.broadinstitute.gatk.utils.help.HelpConstants; - -import java.io.File; -import java.io.PrintStream; -import java.util.List; - -/** - * A generic engine for comparing tree-structured objects - * - *

    - * Compares two record-oriented files, itemizing specific difference between equivalent - * records in the two files. Reports both itemized and summarized differences. - *

    - * - *

    What are the summarized differences and the DiffObjectsWalker?

    - * - *

    - * The GATK contains a summarizing difference engine that compares hierarchical data structures to emit: - *

      - *
    • A list of specific differences between the two data structures. This is similar to saying the value in field A in record 1 in file F differences from the value in field A in record 1 in file G. - *
    • A summarized list of differences ordered by frequency of the difference. This output is similar to saying field A in 50 records in files F and G differed. - *
    - *

    - * - *

    - * The GATK contains a private walker DiffObjects that allows you access to the DiffEngine capabilities on the command line. Simply provide the walker with the master and test files and it will emit summarized differences for you. - *

    - * - *

    Why?

    - * - *

    - * The reason for this system is that it allows you to compare two structured files -- such as BAMs and VCFs -- for common differences among them. This is primarily useful in regression testing or optimization, where you want to ensure that the differences are those that you expect and not any others. - *

    - * - *

    Input

    - *

    - * The DiffObjectsWalker works with BAM or VCF files. - *

    - * - *

    Output

    - *

    - * The DiffEngine system compares to two hierarchical data structures for specific differences in the values of named - * nodes. Suppose I have two trees: - *

    - *     Tree1=(A=1 B=(C=2 D=3))
    - *     Tree2=(A=1 B=(C=3 D=3 E=4))
    - *     Tree3=(A=1 B=(C=4 D=3 E=4))
    - * 
    - *

    - * where every node in the tree is named, or is a raw value (here all leaf values are integers). The DiffEngine - * traverses these data structures by name, identifies equivalent nodes by fully qualified names - * (Tree1.A is distinct from Tree2.A, and determines where their values are equal (Tree1.A=1, Tree2.A=1, so they are). - * These itemized differences are listed as: - *

    - *     Tree1.B.C=2 != Tree2.B.C=3
    - *     Tree1.B.C=2 != Tree3.B.C=4
    - *     Tree2.B.C=3 != Tree3.B.C=4
    - *     Tree1.B.E=MISSING != Tree2.B.E=4
    - * 
    - * - *

    - * This conceptually very similar to the output of the unix command line tool diff. What's nice about DiffEngine though - * is that it computes similarity among the itemized differences and displays the count of differences names - * in the system. In the above example, the field C is not equal three times, while the missing E in Tree1 occurs - * only once. So the summary is: - * - *

    - *     *.B.C : 3
    - *     *.B.E : 1
    - * 
    - * - *

    - * where the * operator indicates that any named field matches. This output is sorted by counts, and provides an - * immediate picture of the commonly occurring differences among the files. - *

    - * Below is a detailed example of two VCF fields that differ because of a bug in the AC, AF, and AN counting routines, - * detected by the integrationtest integration (more below). You can see that in the although there are many specific - * instances of these differences between the two files, the summarized differences provide an immediate picture that - * the AC, AF, and AN fields are the major causes of the differences. - *

    - * - *

    - [testng] path                                                             count
    - [testng] *.*.*.AC                                                         6
    - [testng] *.*.*.AF                                                         6
    - [testng] *.*.*.AN                                                         6
    - [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AC  1
    - [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AF  1
    - [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AN  1
    - [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AC  1
    - [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AF  1
    - [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AN  1
    - [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AC  1
    - [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AF  1
    - [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AN  1
    - [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000598.AC  1
    - 
    - * - *

    Caveat

    - *

    Because this is a walker, it requires that you pass a reference file. However the reference is not actually used, so it does not matter what you pass as reference.

    - * - * - * @author Mark DePristo - * @since 7/4/11 - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) -public class DiffObjects extends RodWalker { - /** - * Writes out a file of the DiffEngine format: - * - * See http://www.broadinstitute.org/gatk/guide/article?id=1299 for details. - */ - @Output(doc="File to which results should be written") - protected PrintStream out; - - /** - * The master file against which we will compare test. This is one of the two required - * files to do the comparison. Conceptually master is the original file contained the expected - * results, but this doesn't currently have an impact on the calculations, but might in the future. - */ - @Input(fullName="master", shortName="m", doc="Master file: expected results", required=true) - File masterFile; - - /** - * The test file against which we will compare to the master. This is one of the two required - * files to do the comparison. Conceptually test is the derived file from master, but this - * doesn't currently have an impact on the calculations, but might in the future. - */ - @Input(fullName="test", shortName="t", doc="Test file: new results to compare to the master file", required=true) - File testFile; - - /** - * The engine will read at most this number of objects from each of master and test files. This reduces - * the memory requirements for DiffObjects but does limit you to comparing at most this number of objects - */ - @Argument(fullName="maxObjectsToRead", shortName="motr", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false) - int MAX_OBJECTS_TO_READ = -1; - - @Argument(fullName="maxRawDiffsToSummarize", shortName="maxRawDiffsToSummarize", doc="Max. number of differences to include in the summary. -1 [default] means unlimited", required=false) - int maxRawDiffsToSummary = -1; - - @Argument(fullName="doPairwise", shortName="doPairwise", doc="If provided, we will compute the minimum pairwise differences to summary, which can be extremely expensive", required=false) - boolean doPairwise = false; - - /** - * The max number of differences to display when summarizing. For example, if there are 10M differences, but - * maxDiffs is 10, then the comparison aborts after first ten summarized differences are shown. Note that - * the system shows differences sorted by frequency, so these 10 would be the most common between the two files. - * A value of 0 means show all possible differences. - */ - @Argument(fullName="maxDiffs", shortName="M", doc="Max. number of diffs to process", required=false) - int MAX_DIFFS = 0; - - /** - * The maximum number of singleton (occurs exactly once between the two files) to display when writing out - * the summary. Only applies if maxDiffs hasn't been exceeded. For example, if maxDiffs is 10 and maxCount1Diffs - * is 2 and there are 20 diffs with count > 1, then only 10 are shown, all of which have count above 1. - */ - @Argument(fullName="maxCount1Diffs", shortName="M1", doc="Max. number of diffs occuring exactly once in the file to process", required=false) - int MAX_COUNT1_DIFFS = 0; - - /** - * Only differences that occur more than minCountForDiff are displayed. For example, if minCountForDiff is 10, then - * a difference must occur at least 10 times between the two files to be shown. - */ - @Argument(fullName="minCountForDiff", shortName="MCFD", doc="Min number of observations for a records to display", required=false) - int minCountForDiff = 1; - - /** - * If provided, the system will write out the summarized, individual differences. May lead to enormous outputs, - * depending on how many differences are found. Note these are not sorted in any way, so if you have 10M - * common differences in the files, you will see 10M records, whereas the final summarize will just list the - * difference and its count of 10M. - */ - @Argument(fullName="showItemizedDifferences", shortName="SID", doc="Should we enumerate all differences between the files?", required=false) - boolean showItemizedDifferences = false; - - @Argument(fullName="iterations", doc="Number of iterations to perform, should be 1 unless you are doing memory testing", required=false) - int iterations = 1; - - DiffEngine diffEngine; - - @Override - public void initialize() { - this.diffEngine = new DiffEngine(); - } - - @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - - @Override - public void onTraversalDone(Integer sum) { - if ( iterations > 1 ) { - for ( int i = 0; i < iterations; i++ ) { - DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, 20, 10, 0, -1, false); - boolean success = DiffEngine.simpleDiffFiles(masterFile, testFile, MAX_OBJECTS_TO_READ, params); - logger.info("Iteration " + i + " success " + success); - } - } else { - //out.printf("Reading master file %s%n", masterFile); - DiffElement master = diffEngine.createDiffableFromFile(masterFile, MAX_OBJECTS_TO_READ); - logger.info(String.format("Read %d objects", master.size())); - //out.printf("Reading test file %s%n", testFile); - DiffElement test = diffEngine.createDiffableFromFile(testFile, MAX_OBJECTS_TO_READ); - logger.info(String.format("Read %d objects", test.size())); - -// out.printf("Master diff objects%n"); -// out.println(master.toString()); -// out.printf("Test diff objects%n"); -// out.println(test.toString()); - - List diffs = diffEngine.diff(master, test); - logger.info(String.format("Done computing diff with %d differences found", diffs.size())); - if ( showItemizedDifferences ) { - out.printf("Itemized results%n"); - for ( Difference diff : diffs ) - out.printf("DIFF: %s%n", diff.toString()); - } - - DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, - MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff, - maxRawDiffsToSummary, doPairwise); - params.setDescending(false); - diffEngine.reportSummarizedDifferences(diffs, params); - logger.info(String.format("Done summarizing differences")); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffValue.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffValue.java deleted file mode 100644 index acec38356..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffValue.java +++ /dev/null @@ -1,90 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 7/4/11 - * Time: 12:55 PM - * - * An interface that must be implemented to allow us to calculate differences - * between structured objects - */ -public class DiffValue { - private DiffElement binding = null; - final private Object value; - - public DiffValue(Object value) { - this.value = value; - } - - public DiffValue(DiffElement binding, Object value) { - this.binding = binding; - this.value = value; - } - - public DiffValue(DiffValue parent, Object value) { - this(parent.getBinding(), value); - } - - public DiffValue(String name, DiffElement parent, Object value) { - this.binding = new DiffElement(name, parent, this); - this.value = value; - } - - public DiffValue(String name, DiffValue parent, Object value) { - this(name, parent.getBinding(), value); - } - - public DiffElement getBinding() { - return binding; - } - - protected void setBinding(DiffElement binding) { - this.binding = binding; - } - - public Object getValue() { - return value; - } - - public String toString() { - return getValue().toString(); - } - - public String toString(int offset) { - return toString(); - } - - public String toOneLineString() { - return getValue().toString(); - } - - public boolean isAtomic() { return true; } - public boolean isCompound() { return ! isAtomic(); } - public int size() { return 1; } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffableReader.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffableReader.java deleted file mode 100644 index 903a073e0..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffableReader.java +++ /dev/null @@ -1,66 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; - -import java.io.File; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 7/4/11 - * Time: 1:09 PM - * - * Interface for readers creating diffable objects from a file - */ -public interface DiffableReader { - @Ensures("result != null") - /** - * Return the name of this DiffableReader type. For example, the VCF reader returns 'VCF' and the - * bam reader 'BAM' - */ - public String getName(); - - @Ensures("result != null") - @Requires("file != null") - /** - * Read up to maxElementsToRead DiffElements from file, and return them. - */ - public DiffElement readFromFile(File file, int maxElementsToRead); - - /** - * Return true if the file can be read into DiffElement objects with this reader. This should - * be uniquely true/false for all readers, as the system will use the first reader that can read the - * file. This routine should never throw an exception. The VCF reader, for example, looks at the - * first line of the file for the ##format=VCF4.1 header, and the BAM reader for the BAM_MAGIC value - * @param file - * @return - */ - @Requires("file != null") - public boolean canRead(File file); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/Difference.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/Difference.java deleted file mode 100644 index c8794a703..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/Difference.java +++ /dev/null @@ -1,137 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -public class Difference implements Comparable { - final String path; // X.Y.Z - final String[] parts; - int count = 1; - DiffElement master = null , test = null; - - public Difference(String path) { - this.path = path; - this.parts = DiffEngine.diffNameToPath(path); - } - - public Difference(DiffElement master, DiffElement test) { - this(createPath(master, test), master, test); - } - - public Difference(String path, DiffElement master, DiffElement test) { - this(path); - this.master = master; - this.test = test; - } - - public String[] getParts() { - return parts; - } - - public void incCount() { count++; } - - public int getCount() { - return count; - } - - public void setCount(int count) { - this.count = count; - } - - /** - * The fully qualified path object A.B.C etc - * @return - */ - public String getPath() { - return path; - } - - /** - * @return the length of the parts of this summary - */ - public int length() { - return this.parts.length; - } - - /** - * Returns true if the string parts matches this summary. Matches are - * must be equal() everywhere where this summary isn't *. - * @param otherParts - * @return - */ - public boolean matches(String[] otherParts) { - if ( otherParts.length != length() ) - return false; - - // TODO optimization: can start at right most non-star element - for ( int i = 0; i < length(); i++ ) { - String part = parts[i]; - if ( ! part.equals("*") && ! part.equals(otherParts[i]) ) - return false; - } - - return true; - } - - @Override - public String toString() { - return String.format("%s:%d:%s", getPath(), getCount(), valueDiffString()); - } - - @Override - public int compareTo(Difference other) { - // sort first highest to lowest count, then by lowest to highest path - int countCmp = Integer.valueOf(count).compareTo(other.count); - return countCmp != 0 ? -1 * countCmp : path.compareTo(other.path); - } - - public String valueDiffString() { - if ( hasSpecificDifference() ) { - return String.format("%s!=%s", getOneLineString(master), getOneLineString(test)); - } else { - return "N/A"; - } - } - - private static String createPath(DiffElement master, DiffElement test) { - return (master == null ? test : master).fullyQualifiedName(); - } - - private static String getOneLineString(DiffElement elt) { - return elt == null ? "MISSING" : elt.getValue().toOneLineString(); - } - - public boolean hasSpecificDifference() { - return master != null || test != null; - } - - public DiffElement getMaster() { - return master; - } - - public DiffElement getTest() { - return test; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/GATKReportDiffableReader.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/GATKReportDiffableReader.java deleted file mode 100644 index 4a78448b6..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/GATKReportDiffableReader.java +++ /dev/null @@ -1,104 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportColumn; -import org.broadinstitute.gatk.engine.report.GATKReportTable; - -import java.io.File; -import java.io.FileReader; -import java.io.IOException; - - -/** - * Class implementing diffnode reader for GATKReports - */ - -// TODO Version check to be added at the report level - -public class GATKReportDiffableReader implements DiffableReader { - @Override - public String getName() { - return "GATKReport"; - } - - @Override - public DiffElement readFromFile(File file, int maxElementsToRead) { - DiffNode root = DiffNode.rooted(file.getName()); - try { - // one line reads the whole thing into memory - GATKReport report = new GATKReport(file); - - for (GATKReportTable table : report.getTables()) { - root.add(tableToNode(table, root)); - } - - return root.getBinding(); - } catch (Exception e) { - return null; - } - } - - private DiffNode tableToNode(GATKReportTable table, DiffNode root) { - DiffNode tableRoot = DiffNode.empty(table.getTableName(), root); - - tableRoot.add("Description", table.getTableDescription()); - tableRoot.add("NumberOfRows", table.getNumRows()); - - for ( GATKReportColumn column : table.getColumnInfo() ) { - DiffNode columnRoot = DiffNode.empty(column.getColumnName(), tableRoot); - - columnRoot.add("Width", column.getColumnFormat().getWidth()); - // NOTE: as the values are trimmed during parsing left/right alignment is not currently preserved - columnRoot.add("Displayable", true); - - for ( int i = 0; i < table.getNumRows(); i++ ) { - String name = column.getColumnName() + (i+1); - columnRoot.add(name, table.get(i, column.getColumnName()).toString()); - } - - tableRoot.add(columnRoot); - } - - return tableRoot; - } - - @Override - public boolean canRead(File file) { - try { - final String HEADER = GATKReport.GATKREPORT_HEADER_PREFIX; - final char[] buff = new char[HEADER.length()]; - final FileReader FR = new FileReader(file); - FR.read(buff, 0, HEADER.length()); - FR.close(); - String firstLine = new String(buff); - return firstLine.startsWith(HEADER); - } catch (IOException e) { - return false; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/VCFDiffableReader.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/VCFDiffableReader.java deleted file mode 100644 index 23b213e91..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/VCFDiffableReader.java +++ /dev/null @@ -1,145 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import org.apache.log4j.Logger; -import htsjdk.tribble.AbstractFeatureReader; -import htsjdk.tribble.FeatureReader; -import org.broadinstitute.gatk.utils.Utils; -import htsjdk.variant.vcf.*; -import htsjdk.variant.variantcontext.Genotype; -import htsjdk.variant.variantcontext.VariantContext; - -import java.io.*; -import java.util.Iterator; -import java.util.Map; - - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 7/4/11 - * Time: 1:09 PM - * - * Class implementing diffnode reader for VCF - */ -public class VCFDiffableReader implements DiffableReader { - private static Logger logger = Logger.getLogger(VCFDiffableReader.class); - - @Override - public String getName() { return "VCF"; } - - @Override - public DiffElement readFromFile(File file, int maxElementsToRead) { - DiffNode root = DiffNode.rooted(file.getName()); - try { - // read the version line from the file - BufferedReader br = new BufferedReader(new FileReader(file)); - final String version = br.readLine(); - root.add("VERSION", version); - br.close(); - - final VCFCodec vcfCodec = new VCFCodec(); - vcfCodec.disableOnTheFlyModifications(); // must be read as state is stored in reader itself - - FeatureReader reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), vcfCodec, false); - VCFHeader header = (VCFHeader)reader.getHeader(); - for ( VCFHeaderLine headerLine : header.getMetaDataInInputOrder() ) { - String key = headerLine.getKey(); - if ( headerLine instanceof VCFIDHeaderLine) - key += "_" + ((VCFIDHeaderLine) headerLine).getID(); - if ( root.hasElement(key) ) - logger.warn("Skipping duplicate header line: file=" + file + " line=" + headerLine.toString()); - else - root.add(key, headerLine.toString()); - } - - int count = 0, nRecordsAtPos = 1; - String prevName = ""; - Iterator it = reader.iterator(); - while ( it.hasNext() ) { - VariantContext vc = it.next(); - String name = vc.getChr() + ":" + vc.getStart(); - if ( name.equals(prevName) ) { - name += "_" + ++nRecordsAtPos; - } else { - prevName = name; - } - DiffNode vcRoot = DiffNode.empty(name, root); - - // add fields - vcRoot.add("CHROM", vc.getChr()); - vcRoot.add("POS", vc.getStart()); - vcRoot.add("ID", vc.getID()); - vcRoot.add("REF", vc.getReference()); - vcRoot.add("ALT", vc.getAlternateAlleles()); - vcRoot.add("QUAL", vc.hasLog10PError() ? vc.getLog10PError() * -10 : VCFConstants.MISSING_VALUE_v4); - vcRoot.add("FILTER", ! vc.filtersWereApplied() // needs null to differentiate between PASS and . - ? VCFConstants.MISSING_VALUE_v4 - : ( vc.getFilters().isEmpty() ? VCFConstants.PASSES_FILTERS_v4 : vc.getFilters()) ); - - // add info fields - for (Map.Entry attribute : vc.getAttributes().entrySet()) { - if ( ! attribute.getKey().startsWith("_") ) - vcRoot.add(attribute.getKey(), attribute.getValue()); - } - - for (Genotype g : vc.getGenotypes() ) { - DiffNode gRoot = DiffNode.empty(g.getSampleName(), vcRoot); - gRoot.add("GT", g.getGenotypeString()); - if ( g.hasGQ() ) gRoot.add("GQ", g.getGQ() ); - if ( g.hasDP() ) gRoot.add("DP", g.getDP() ); - if ( g.hasAD() ) gRoot.add("AD", Utils.join(",", g.getAD())); - if ( g.hasPL() ) gRoot.add("PL", Utils.join(",", g.getPL())); - if ( g.getFilters() != null ) gRoot.add("FT", g.getFilters()); - - for (Map.Entry attribute : g.getExtendedAttributes().entrySet()) { - if ( ! attribute.getKey().startsWith("_") ) - gRoot.add(attribute.getKey(), attribute.getValue()); - } - - vcRoot.add(gRoot); - } - - root.add(vcRoot); - count += vcRoot.size(); - if ( count > maxElementsToRead && maxElementsToRead != -1) - break; - } - - reader.close(); - } catch ( IOException e ) { - return null; - } - - return root.getBinding(); - } - - @Override - public boolean canRead(File file) { - return AbstractVCFCodec.canDecodeFile(file.getPath(), VCFCodec.VCF4_MAGIC_HEADER); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java index 43403ab79..391b0202f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java @@ -40,7 +40,7 @@ import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.text.XReadLines; import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import htsjdk.variant.bcf2.BCF2Codec; import org.broadinstitute.gatk.utils.collections.Pair; import htsjdk.variant.vcf.VCFCodec; @@ -56,24 +56,23 @@ import java.util.*; /** * - * Concatenates VCF files of non-overlapped genome intervals, all with the same set of samples + * Concatenate VCF files of non-overlapping genome intervals, all with the same set of samples * *

    * The main purpose of this tool is to speed up the gather function when using scatter-gather parallelization. * This tool concatenates the scattered output VCF files. It assumes that: - * - All the input VCFs (or BCFs) contain the same samples in the same order. - * - The variants in each input file are from non-overlapping (scattered) intervals. - * - * When the input files are already sorted based on the intervals start positions, use -assumeSorted. - * - * Note: Currently the tool is more efficient when working with VCFs; we will work to make it as efficient for BCFs. - * + *

      + *
    • All the input VCFs (or BCFs) contain the same samples in the same order.
    • + *
    • The variants in each input file are from non-overlapping (scattered) intervals.
    • + *
    *

    + *

    When the input files are already sorted based on the intervals start positions, use -assumeSorted.

    * *

    Input

    *

    - * One or more variant sets to combine. They should be of non-overlapping genome intervals and with the same samples (in the same order). - * If the files are ordered according to the appearance of intervals in the ref genome, then one can use the -assumeSorted flag. + * Two or more variant sets to combine. They should be of non-overlapping genome intervals and with the same + * samples (sorted in the same order). If the files are ordered according to the appearance of intervals in the ref + * genome, then one can use the -assumeSorted flag. *

    * *

    Output

    @@ -86,16 +85,19 @@ import java.util.*; * invoke it is a little different from other GATK tools (see example below), and it does not accept any of the * classic "CommandLineGATK" arguments.

    * - *

    Example

    + *

    Usage example

    *
      * java -cp GenomeAnalysisTK.jar org.broadinstitute.gatk.tools.CatVariants \
    - *    -R ref.fasta \
    + *    -R reference.fasta \
      *    -V input1.vcf \
      *    -V input2.vcf \
      *    -out output.vcf \
      *    -assumeSorted
      * 
    * + *

    Caveat

    + *

    Currently the tool is more efficient when working with VCFs than with BCFs.

    + * * @author Ami Levy Moonshine * @since Jan 2012 */ @@ -147,37 +149,30 @@ public class CatVariants extends CommandLineProgram { INVALID } - private FileType fileExtensionCheck(File inFile, File outFile) { + private FileType fileExtensionCheck(File inFile, FileType previousFileType) { final String inFileName = inFile.toString().toLowerCase(); - final String outFileName = outFile.toString().toLowerCase(); - - FileType inFileType = FileType.INVALID; if (inFileName.endsWith(".vcf")) { - inFileType = FileType.VCF; - if (outFileName.endsWith(".vcf")) - return inFileType; + if (previousFileType == FileType.VCF || previousFileType == null) { + return FileType.VCF; + } } if (inFileName.endsWith(".bcf")) { - inFileType = FileType.BCF; - if (outFileName.endsWith(".bcf")) - return inFileType; + if (previousFileType == FileType.BCF || previousFileType == null) { + return FileType.BCF; + } } for (String extension : AbstractFeatureReader.BLOCK_COMPRESSED_EXTENSIONS) { if (inFileName.endsWith(".vcf" + extension)) { - inFileType = FileType.BLOCK_COMPRESSED_VCF; - if (outFileName.endsWith(".vcf" + extension)) - return inFileType; + if (previousFileType == FileType.BLOCK_COMPRESSED_VCF || previousFileType == null) { + return FileType.BLOCK_COMPRESSED_VCF; + } } } - if (inFileType == FileType.INVALID) - System.err.println(String.format("File extension for input file %s is not valid for CatVariants", inFile)); - else - System.err.println(String.format("File extension mismatch between input %s and output %s", inFile, outFile)); - + System.err.println(String.format("File extension for input file %s is not valid for CatVariants", inFile)); printUsage(); return FileType.INVALID; } @@ -241,10 +236,10 @@ public class CatVariants extends CommandLineProgram { else priorityQueue = new PriorityQueue<>(10000, positionComparator); - FileType fileType = FileType.INVALID; + FileType fileType = null; for (File file : variant) { // if it returns a valid type, it will be the same for all files - fileType = fileExtensionCheck(file, outputFile); + fileType = fileExtensionCheck(file, fileType); if (fileType == FileType.INVALID) return 1; @@ -266,12 +261,11 @@ public class CatVariants extends CommandLineProgram { reader.close(); priorityQueue.add(new Pair<>(firstPosition,file)); } - } FileOutputStream outputStream = new FileOutputStream(outputFile); EnumSet options = EnumSet.of(Options.INDEX_ON_THE_FLY); - final IndexCreator idxCreator = GATKVCFUtils.getIndexCreator(variant_index_type, variant_index_parameter, outputFile, ref.getSequenceDictionary()); + IndexCreator idxCreator = GATKVCFUtils.makeIndexCreator(variant_index_type, variant_index_parameter, outputFile, ref.getSequenceDictionary()); final VariantContextWriter outputWriter = VariantContextWriterFactory.create(outputFile, outputStream, ref.getSequenceDictionary(), idxCreator, options); boolean firstFile = true; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/ListAnnotations.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/ListAnnotations.java index 72d30defd..4bc91d6e3 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/ListAnnotations.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/ListAnnotations.java @@ -27,9 +27,9 @@ package org.broadinstitute.gatk.tools; import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotationHelpUtils; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.help.HelpUtils; /** * Utility program to print a list of available annotations @@ -66,7 +66,7 @@ public class ListAnnotations extends CommandLineProgram { @Override protected int execute() throws Exception { - HelpUtils.listAnnotations(); + AnnotationHelpUtils.listAnnotations(); return 0; } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java index 9127b5ee2..5de5d6656 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java @@ -30,16 +30,15 @@ import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypesContext; import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import java.util.Arrays; import java.util.HashMap; @@ -144,15 +143,15 @@ public class AlleleBalance extends InfoFieldAnnotation { Map map = new HashMap<>(); if ( weightHet > 0.0 ) { - map.put("ABHet",ratioHet/weightHet); + map.put(GATKVCFConstants.ALLELE_BALANCE_HET_KEY,ratioHet/weightHet); } if ( weightHom > 0.0 ) { - map.put("ABHom",ratioHom/weightHom); + map.put(GATKVCFConstants.ALLELE_BALANCE_HOM_KEY,ratioHom/weightHom); } if ( overallNonDiploid > 0.0 ) { - map.put("OND",overallNonDiploid); + map.put(GATKVCFConstants.NON_DIPLOID_RATIO_KEY,overallNonDiploid); } return map; } @@ -210,9 +209,10 @@ public class AlleleBalance extends InfoFieldAnnotation { } - public List getKeyNames() { return Arrays.asList("ABHet","ABHom","OND"); } - - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ABHet", 1, VCFHeaderLineType.Float, "Allele Balance for heterozygous calls (ref/(ref+alt))"), - new VCFInfoHeaderLine("ABHom", 1, VCFHeaderLineType.Float, "Allele Balance for homozygous calls (A/(A+O)) where A is the allele (ref or alt) and O is anything other"), - new VCFInfoHeaderLine("OND", 1, VCFHeaderLineType.Float, "Overall non-diploid ratio (alleles/(alleles+non-alleles))")); } + @Override + public List getKeyNames() { + return Arrays.asList(GATKVCFConstants.ALLELE_BALANCE_HET_KEY, + GATKVCFConstants.ALLELE_BALANCE_HOM_KEY, + GATKVCFConstants.NON_DIPLOID_RATIO_KEY); + } } \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java index 9f5ee9c55..1c99fa8fc 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java @@ -32,9 +32,9 @@ import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFFormatHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineType; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; @@ -43,6 +43,8 @@ import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.Arrays; import java.util.HashMap; @@ -63,7 +65,7 @@ import java.util.Set; *
      *
    • This annotation will only work properly for biallelic heterozygous calls.
    • *
    • This annotation cannot currently be calculated for indels.
    • - *
    • tThe reasoning underlying this annotation only applies to germline variants in DNA sequencing data. In somatic/cancer analysis, divergent ratios are expected due to tumor heterogeneity. In RNAseq analysis, divergent ratios may indicate differential allele expression.
    • + *
    • The reasoning underlying this annotation only applies to germline variants in DNA sequencing data. In somatic/cancer analysis, divergent ratios are expected due to tumor heterogeneity. In RNAseq analysis, divergent ratios may indicate differential allele expression.
    • *
    • As stated above, this annotation is experimental and should be interpreted with caution as we cannot guarantee that it is appropriate. Basically, use it at your own risk.
    • *
    *

    Related annotations

    @@ -92,7 +94,7 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim // and isBiallelic() while ignoring the allele boolean biallelicSNP = vc.isSNP() && vc.isBiallelic(); - if(vc.hasAllele(GVCF_NONREF)){ + if(vc.hasAllele(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE)){ // If we have the GVCF allele, then the SNP is biallelic // iff there are 3 alleles and both the reference and first alt // allele are length 1. @@ -118,8 +120,6 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim gb.attribute(getKeyNames().get(0), Double.valueOf(String.format("%.2f", ratio))); } - private static final Allele GVCF_NONREF = Allele.create("", false); - private Double annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc) { final HashMap alleleCounts = new HashMap<>(); @@ -175,7 +175,7 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim } - public List getKeyNames() { return Arrays.asList("AB"); } + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.ALLELE_BALANCE_KEY); } - public List getDescriptions() { return Arrays.asList(new VCFFormatHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Allele balance for each het genotype")); } + public List getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getFormatLine(getKeyNames().get(0))); } } \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCounts.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCounts.java index 44579f9ba..a01c945ac 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCounts.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCounts.java @@ -25,16 +25,17 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.BaseUtils; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.Arrays; import java.util.HashMap; @@ -80,12 +81,12 @@ import java.util.Map; counts[index]++; } } - Map map = new HashMap(); + Map map = new HashMap<>(); map.put(getKeyNames().get(0), counts); return map; } - public List getKeyNames() { return Arrays.asList("BaseCounts"); } + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.BASE_COUNTS_KEY); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("BaseCounts", 4, VCFHeaderLineType.Integer, "Counts of each base")); } + public List getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } } \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCountConstants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCountConstants.java deleted file mode 100644 index 67fc0a406..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCountConstants.java +++ /dev/null @@ -1,44 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.tools.walkers.annotator; - -import htsjdk.variant.vcf.VCFConstants; -import htsjdk.variant.vcf.VCFInfoHeaderLine; -import htsjdk.variant.vcf.VCFStandardHeaderLines; - - -/** - * Keys and descriptions for the common chromosome count annotations - */ -public class ChromosomeCountConstants { - - public static final String[] keyNames = { VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY }; - - public static final VCFInfoHeaderLine[] descriptions = { - VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY), - VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY), - VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY) }; -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LowMQ.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LowMQ.java index 05054a3f3..1d4b7a002 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LowMQ.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LowMQ.java @@ -25,16 +25,17 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.gatk.utils.pileup.PileupElement; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import java.util.Arrays; import java.util.HashMap; @@ -48,8 +49,7 @@ import java.util.Map; *

    This annotation tells you what fraction of reads have a mapping quality of less than the given threshold of 10 (including 0). Note that certain tools may impose a different minimum mapping quality threshold. For example, HaplotypeCaller excludes reads with MAPQ<20.

    * *

    Calculation

    - *

    $$ LowMQ = \frac{# reads with MAPQ=0 + # reads with MAPQ<10}{total # reads} $$ - *

    + * $$ LowMQ = \frac{# reads with MAPQ=0 + # reads with MAPQ<10}{total # reads} $$ * *

    Related annotations

    *