diff --git a/ant-bridge.sh b/ant-bridge.sh
index a2f686586..af94fa435 100755
--- a/ant-bridge.sh
+++ b/ant-bridge.sh
@@ -1,6 +1,7 @@
#!/bin/sh
-mvn_args="verify"
+default_args="verify '-Ddisable.shadepackage'"
+mvn_args="${default_args}"
mvn_properties=
mvn_clean=
unknown_args=
@@ -44,22 +45,23 @@ for arg in "${@}" ; do
fi
else
- if [[ "${arg}" != "dist" && "${mvn_args}" != "" && "${mvn_args}" != "verify" ]] ; then
+ if [[ "${arg}" != "dist" && "${mvn_args}" != "" && "${mvn_args}" != "${default_args}" ]] ; then
echo "Sorry, this script does not currently support mixing targets." >&2
exit 1
elif [[ "${arg}" == "dist" ]] ; then
- mvn_args="verify"
+ mvn_args="${default_args}"
elif [[ "${arg}" == "gatk" ]] ; then
- mvn_args="verify '-P!queue'"
+ mvn_args="${default_args} '-P!queue'"
elif [[ "${arg}" == "test.compile" ]] ; then
mvn_args="test-compile"
elif [[ "${arg}" == "gatkdocs" ]] ; then
local_repo="sitetemprepo"
- mvn_args="install -Dmaven.repo.local=${local_repo} -Ddisable.queue && mvn site -Dmaven.repo.local=${local_repo} -Ddisable.queue"
+ mvn_args="install -Dmaven.repo.local=${local_repo} '-P!queue' && mvn site -Dmaven.repo.local=${local_repo} '-P!queue'"
+ mvn_pkg_args=
elif [[ "${arg}" == "package.gatk.full" ]] ; then
mvn_args="package '-P!private,!queue'"
@@ -75,11 +77,11 @@ for arg in "${@}" ; do
# elif [[ "${arg}" == "release.gatk.full" ]] ; then
# mvn_args="package '-P!private,!queue'"
-# post_script=" && private/src/main/scripts/shell/copy_release.sh public/gatk-package/target/GenomeAnalysisTK-*.tar.bz2"
+# post_script=" && private/src/main/scripts/shell/copy_release.sh protected/gatk-package-distribution/target/GenomeAnalysisTK-*.tar.bz2"
# elif [[ "${arg}" == "release.queue.full" ]] ; then
# mvn_args="package '-P!private'"
-# post_script=" && private/src/main/scripts/shell/copy_release.sh public/queue-package/target/Queue-*.tar.bz2"
+# post_script=" && private/src/main/scripts/shell/copy_release.sh protected/gatk-queue-package-distribution/target/Queue-*.tar.bz2"
elif [[ "${arg}" == "build-picard-private" ]] ; then
mvn_args="mvn install -f private/picard-maven/pom.xml"
@@ -113,7 +115,7 @@ for arg in "${@}" ; do
mvn_args="${mvn_args} -Dgatk.queuetests.run=true"
elif [[ "${arg}" == "committests" ]] ; then
- mvn_args="verify -Dgatk.committests.skipped=false"
+ mvn_args="${default_args} -Dgatk.committests.skipped=false"
elif [[ "${arg}" == "test" ]] ; then
mvn_args="test -Dgatk.unittests.skipped=false"
@@ -122,19 +124,19 @@ for arg in "${@}" ; do
mvn_args="test -Dgatk.unittests.skipped=false"
elif [[ "${arg}" == "integrationtest" ]] ; then
- mvn_args="verify -Dgatk.integrationtests.skipped=false"
+ mvn_args="${default_args} -Dgatk.integrationtests.skipped=false"
elif [[ "${arg}" == "largescaletest" ]] ; then
- mvn_args="verify -Dgatk.largescaletests.skipped=false"
+ mvn_args="${default_args} -Dgatk.largescaletests.skipped=false"
elif [[ "${arg}" == "knowledgebasetest" ]] ; then
- mvn_args="verify -Dgatk.knowledgebasetests.skipped=false"
+ mvn_args="${default_args} -Dgatk.knowledgebasetests.skipped=false"
elif [[ "${arg}" == "queuetest" ]] ; then
- mvn_args="verify -Dgatk.queuetests.skipped=false"
+ mvn_args="${default_args} -Dgatk.queuetests.skipped=false"
elif [[ "${arg}" == "queuetestrun" ]] ; then
- mvn_args="verify -Dgatk.queuetests.skipped=false -Dgatk.queuetests.run=true"
+ mvn_args="${default_args} -Dgatk.queuetests.skipped=false -Dgatk.queuetests.run=true"
elif [[ "${arg}" == "fasttest" ]] ; then
mvn_args="verify -Dgatk.committests.skipped=false -pl private/gatk-tools-private -am -Dresource.bundle.skip=true"
diff --git a/pom.xml b/pom.xml
index 8488cf87f..95440ec8a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -13,7 +13,7 @@
org.broadinstitute.gatkgatk-root
- 3.3
+ 3.4-SNAPSHOTpublic/gatk-root
@@ -32,11 +32,15 @@
false-build-timestamp "${maven.build.timestamp}"
+ ${gatk.basedir}/public/src/main/scripts/shell
+ ${gatk.basedir}/public/src/main/assemblypackage
+ prepare-package
+ packagegenerate-resourcesprocess-resourcesprocess-test-resources
@@ -65,6 +69,16 @@
${gatk.serialcommittests.skipped}truetrue
+
+
+ ${gatk.basedir}/target/executable
+
+ ${gatk.basedir}/target/package
+
+ ${gatk.basedir}/target
@@ -138,9 +152,19 @@
${resource.bundle.path}
+
+ executable-jar-lib
+
+ copy-dependencies
+
+ none
+
+ ${gatk.executable.directory}/lib
+ runtime
+
+
-
org.apache.maven.pluginsmaven-resources-plugin
@@ -159,25 +183,6 @@
${gatk.process-test-resources.phase}
-
- copy-resource-bundle-log4j
-
- copy-resources
-
- none
-
-
- ${project.reporting.outputDirectory}/apidocs
-
-
- ${gatk.basedir}/gatk-utils/src/main/config/org/broadinstitute/gatk/utils/help
-
-
-
-
@@ -198,8 +203,7 @@
${project.build.outputDirectory}${project.groupId}
-
- gatk-tools-public
+ ${project.artifactId}${project.version}2g
@@ -319,10 +323,40 @@
org.apache.maven.pluginsmaven-jar-plugin
+
+ executable-jar
+
+ jar
+
+ none
+
+ ${project.build.outputDirectory}/ignored_by_executable_jar
+ ${gatk.executable.directory}
+ ${gatk.binary-dist.name}
+
+
+ ${app.main.class}
+ true
+ lib/
+
+
+
+ default-jar${gatk.jar.phase}
+
+
+ unshaded-default-jar
+
+ jar
+
+ none
+ test-jar
@@ -341,13 +375,14 @@
maven-shade-plugin
- gatk-executable
+ package-jarshadenonetrue
+ falseorg.broadinstitute.gatk:gsalib:tar.gz:*
@@ -405,7 +440,7 @@
none
- src/main/assembly/binary-dist.xml
+ ${gatk.assembly.directory}/binary-dist.xml
@@ -437,7 +472,7 @@
- link-binary-jar
+ link-executable-jarlink
@@ -445,7 +480,26 @@
- ${gatk.basedir}/target/${gatk.binary-dist.name}.${project.packaging}
+ ${gatk.shortcut.directory}/${gatk.binary-dist.name}.${project.packaging}
+ ${gatk.executable.directory}/${gatk.binary-dist.name}.${project.packaging}
+
+
+
+
+
+ link-package-jar
+
+ link
+
+ none
+
+
+
+ ${gatk.package.directory}/${gatk.binary-dist.name}.${project.packaging}
+ ${project.build.directory}/${project.build.finalName}.${project.packaging}
+
+
+ ${gatk.shortcut.directory}/${gatk.binary-dist.name}.${project.packaging}${project.build.directory}/${project.build.finalName}.${project.packaging}
@@ -624,6 +678,21 @@
org.codehaus.mojoexec-maven-plugin
+
+
+ check-utils-engine-tools
+
+ exec
+
+ process-sources
+ false
+
+ ${gatk.shell.directory}/check_utils_engine_tools.sh
+
+ false
- org.broadinstitute.gatk.utils.help.GATKDoclet
+ org.broadinstitute.gatk.tools.walkers.help.WalkerDoclet${project.groupId}gatk-package-distribution
@@ -733,6 +802,26 @@
+
+
+
+ fast
+
+
+ disable.shadepackage
+
+
+
+ none
+ none
+
+
+
packagetests-enabled
@@ -746,6 +835,8 @@
truetruenone
+ none
+ nonenonenonenone
diff --git a/protected/gatk-package-distribution/pom.xml b/protected/gatk-package-distribution/pom.xml
index f8f530afa..c48ffad7b 100644
--- a/protected/gatk-package-distribution/pom.xml
+++ b/protected/gatk-package-distribution/pom.xml
@@ -5,7 +5,7 @@
org.broadinstitute.gatkgatk-aggregator
- 3.3
+ 3.4-SNAPSHOT../..
@@ -15,8 +15,6 @@
${project.basedir}/../..
- prepare-package
- packageorg.broadinstitute.gatk.engine.CommandLineGATKGenomeAnalysisTK
@@ -43,9 +41,14 @@
gatk-tools-protected${project.version}
+
+
+ org.slf4j
+ slf4j-log4j12
+
- samtools
+ com.github.samtoolshtsjdk
@@ -73,7 +76,7 @@
${project.groupId}
- gatk-engine
+ gatk-utils${project.version}example-resourcestar.bz2
@@ -164,6 +167,25 @@
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+
+
+ executable-jar
+ ${gatk.jar.phase}
+
+
+ default-jar
+ none
+
+
+ unshaded-default-jar
+ ${gatk.jar.phase}
+
+
+
+
org.apache.maven.pluginsmaven-dependency-plugin
@@ -172,6 +194,10 @@
unpack-direct-dependencies${gatk.unpack.phase}
+
+ executable-jar-lib
+ ${gatk.jar.phase}
+
@@ -180,7 +206,7 @@
maven-shade-plugin
- gatk-executable
+ package-jar${gatk.shade.phase}
@@ -202,7 +228,11 @@
maven-junction-plugin
- link-binary-jar
+ link-executable-jar
+ ${gatk.jar.phase}
+
+
+ link-package-jar${gatk.shade.phase}
@@ -231,20 +261,6 @@
-
- packagetests-enabled
-
-
- gatk.packagetests.enabled
- true
-
-
-
- none
- none
- none
-
- gsadev
diff --git a/protected/gatk-package-distribution/src/main/assembly/binary-dist.xml b/protected/gatk-package-distribution/src/main/assembly/binary-dist.xml
deleted file mode 100644
index 11fb98e00..000000000
--- a/protected/gatk-package-distribution/src/main/assembly/binary-dist.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-
- binary-dist
-
- tar.bz2
-
- false
-
-
-
- org.broadinstitute.gatk:gatk-package-distribution
-
- ${gatk.binary-dist.name}.${artifact.extension}
-
-
- resources
- true
-
- org.broadinstitute.gatk:gatk-engine:tar.bz2:example-resources
-
-
-
-
diff --git a/protected/gatk-queue-extensions-distribution/pom.xml b/protected/gatk-queue-extensions-distribution/pom.xml
index 9f82d9edc..6b8e52749 100644
--- a/protected/gatk-queue-extensions-distribution/pom.xml
+++ b/protected/gatk-queue-extensions-distribution/pom.xml
@@ -5,7 +5,7 @@
org.broadinstitute.gatkgatk-aggregator
- 3.3
+ 3.4-SNAPSHOT../..
@@ -41,6 +41,10 @@
log4jlog4j
+
+ com.github.broadinstitute
+ picard
+ ${project.groupId}
- gatk-tools-public
+ gatk-utils${project.version}test-jartest
diff --git a/protected/gatk-queue-extensions-distribution/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/examples/ExampleFindCoveredIntervals.scala b/protected/gatk-queue-extensions-distribution/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/examples/ExampleFindCoveredIntervals.scala
new file mode 100644
index 000000000..48a393cb6
--- /dev/null
+++ b/protected/gatk-queue-extensions-distribution/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/examples/ExampleFindCoveredIntervals.scala
@@ -0,0 +1,76 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2014 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.queue.qscripts.examples
+
+import org.broadinstitute.gatk.queue.QScript
+import org.broadinstitute.gatk.queue.extensions.gatk._
+
+class ExampleFindCoveredIntervals extends QScript {
+ @Input(doc="The reference file for the bam files.", shortName="R")
+ var referenceFile: File = _
+
+ @Input(doc="Bam file to genotype.", shortName="I")
+ var bamFile: File = _
+
+ @Output(doc="Bam output", shortName="out")
+ var outFile: File = _
+
+ def script() {
+ val fci = new FindCoveredIntervals
+ fci.R = referenceFile
+ fci.memoryLimit = 2
+ fci.scatterCount = 3
+ fci.I :+= bamFile
+ fci.out = outFile
+ add(fci)
+ }
+}
diff --git a/protected/gatk-queue-extensions-distribution/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/examples/ExampleHaplotypeCaller.scala b/protected/gatk-queue-extensions-distribution/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/examples/ExampleHaplotypeCaller.scala
new file mode 100644
index 000000000..de7cdd4c6
--- /dev/null
+++ b/protected/gatk-queue-extensions-distribution/src/main/qscripts/org/broadinstitute/gatk/queue/qscripts/examples/ExampleHaplotypeCaller.scala
@@ -0,0 +1,80 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2014 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.queue.qscripts.examples
+
+import org.broadinstitute.gatk.queue.QScript
+import org.broadinstitute.gatk.queue.extensions.gatk._
+
+class ExampleHaplotypeCaller extends QScript {
+ @Input(doc="The reference file for the bam files.", shortName="R")
+ var referenceFile: File = _
+
+ @Input(doc="Bam file to genotype.", shortName="I")
+ var bamFile: File = _
+
+ @Output(doc="VCF output", shortName="out")
+ var outFile: File = _
+
+ @Argument(doc="One or more genomic intervals over which to operate", shortName="L", required=false)
+ var intervals: Seq[String] = Nil
+
+ def script() {
+ val hc = new HaplotypeCaller
+ hc.R = referenceFile
+ hc.memoryLimit = 2
+ hc.scatterCount = 3
+ hc.I :+= bamFile
+ hc.out = outFile
+ hc.intervalsString = intervals
+ add(hc)
+ }
+}
diff --git a/protected/gatk-queue-extensions-distribution/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/ExampleUnifiedGenotyperQueueTest.scala b/protected/gatk-queue-extensions-distribution/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/ExampleUnifiedGenotyperQueueTest.scala
index c4e81047f..d49bee6a5 100644
--- a/protected/gatk-queue-extensions-distribution/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/ExampleUnifiedGenotyperQueueTest.scala
+++ b/protected/gatk-queue-extensions-distribution/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/ExampleUnifiedGenotyperQueueTest.scala
@@ -91,7 +91,7 @@ class ExampleUnifiedGenotyperQueueTest {
" -R " + BaseTest.hg18Reference,
" -I " + BaseTest.validationDataLocation + "OV-0930.normal.chunk.bam",
" -L " + intervalsPath).mkString
- spec.jobRunners = Seq("Lsf706")
+ spec.jobRunners = Seq("GridEngine")
QueueTest.executeTest(spec)
}
diff --git a/protected/gatk-queue-extensions-distribution/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/UnmappedExcludedQueueTest.scala b/protected/gatk-queue-extensions-distribution/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/UnmappedExcludedQueueTest.scala
new file mode 100644
index 000000000..997f0e9ef
--- /dev/null
+++ b/protected/gatk-queue-extensions-distribution/src/test/scala/org/broadinstitute/gatk/queue/pipeline/examples/UnmappedExcludedQueueTest.scala
@@ -0,0 +1,93 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2014 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.queue.pipeline.examples
+
+import org.broadinstitute.gatk.queue.pipeline.{QueueTest, QueueTestSpec}
+import org.broadinstitute.gatk.utils.BaseTest
+import org.testng.annotations.Test
+
+class UnmappedExcludedQueueTest {
+
+ @Test(timeOut=36000000)
+ def testUnmappedExclusion(): Unit = {
+
+ //FindCoveredIntervals is an ActiveRegionWalker, which throws an exception if it encounters unmapped reads
+ //But it's partitioned by contigs, which by default includes unmapped reads. Verify that the unmapped reads
+ //are correctly not added in this case
+ val testOut = "fci.out"
+ val spec = new QueueTestSpec
+ spec.name = "findcoveredintervals"
+ spec.args = Array(
+ " -S " + QueueTest.protectedQScriptsPackageDir + "examples/ExampleFindCoveredIntervals.scala",
+ " -R " + BaseTest.publicTestDir + "exampleFASTA.fasta",
+ " -I " + BaseTest.publicTestDir + "exampleBAM.bam",
+ " -out " + testOut).mkString
+
+ //The output file is blank - the real test is simply that it runs to completion
+ spec.fileMD5s += testOut -> "d41d8cd98f00b204e9800998ecf8427e"
+ QueueTest.executeTest(spec)
+
+ //Regression Test: HaplotypeCaller is also an ActiveRegionWalker, and is much more widely used. Explicitly test
+ //it as well
+ val hcTestOut = "hctest.vcf"
+ val hcSpec = new QueueTestSpec
+ hcSpec.name = "haplotypecaller"
+ hcSpec.args = Array(
+ " -S " + QueueTest.protectedQScriptsPackageDir + "examples/ExampleHaplotypeCaller.scala",
+ " -R " + BaseTest.publicTestDir + "exampleFASTA.fasta",
+ " -I " + BaseTest.publicTestDir + "exampleBAM.bam",
+ " -out " + hcTestOut).mkString
+
+ //The output file is blank - the real test is simply that it runs to completion
+ QueueTest.executeTest(hcSpec)
+ }
+}
diff --git a/protected/gatk-queue-package-distribution/pom.xml b/protected/gatk-queue-package-distribution/pom.xml
index 20de01afb..cb653c91b 100644
--- a/protected/gatk-queue-package-distribution/pom.xml
+++ b/protected/gatk-queue-package-distribution/pom.xml
@@ -5,7 +5,7 @@
org.broadinstitute.gatkgatk-aggregator
- 3.3
+ 3.4-SNAPSHOT../..
@@ -15,8 +15,6 @@
${project.basedir}/../..
- prepare-package
- packageQueueorg.broadinstitute.gatk.queue.QCommandLine
@@ -49,7 +47,7 @@
- picard
+ com.github.broadinstitutepicard
@@ -80,7 +78,7 @@
${project.groupId}
- gatk-engine
+ gatk-utils${project.version}example-resourcestar.bz2
@@ -171,6 +169,25 @@
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+
+
+ executable-jar
+ ${gatk.jar.phase}
+
+
+ default-jar
+ none
+
+
+ unshaded-default-jar
+ ${gatk.jar.phase}
+
+
+
+
org.apache.maven.pluginsmaven-dependency-plugin
@@ -179,6 +196,10 @@
unpack-direct-dependencies${gatk.unpack.phase}
+
+ executable-jar-lib
+ ${gatk.jar.phase}
+
@@ -187,7 +208,7 @@
maven-shade-plugin
- gatk-executable
+ package-jar${gatk.shade.phase}
@@ -209,7 +230,11 @@
maven-junction-plugin
- link-binary-jar
+ link-executable-jar
+ ${gatk.jar.phase}
+
+
+ link-package-jar${gatk.shade.phase}
@@ -238,20 +263,6 @@
-
- packagetests-enabled
-
-
- gatk.packagetests.enabled
- true
-
-
-
- none
- none
- none
-
- gsadev
diff --git a/protected/gatk-queue-package-distribution/src/main/assembly/binary-dist.xml b/protected/gatk-queue-package-distribution/src/main/assembly/binary-dist.xml
deleted file mode 100644
index daa974216..000000000
--- a/protected/gatk-queue-package-distribution/src/main/assembly/binary-dist.xml
+++ /dev/null
@@ -1,23 +0,0 @@
-
- binary-dist
-
- tar.bz2
-
- false
-
-
-
- org.broadinstitute.gatk:gatk-queue-package-distribution
-
- ${gatk.binary-dist.name}.${artifact.extension}
-
-
- resources
- true
-
- org.broadinstitute.gatk:gatk-engine:tar.bz2:example-resources
- org.broadinstitute.gatk:gatk-queue-extensions-public:tar.bz2:example-resources
-
-
-
-
diff --git a/protected/gatk-tools-protected/pom.xml b/protected/gatk-tools-protected/pom.xml
index 6f026c827..3df22c1a1 100644
--- a/protected/gatk-tools-protected/pom.xml
+++ b/protected/gatk-tools-protected/pom.xml
@@ -5,7 +5,7 @@
org.broadinstitute.gatkgatk-aggregator
- 3.3
+ 3.4-SNAPSHOT../..
@@ -48,7 +48,15 @@
${project.groupId}
- gatk-tools-public
+ gatk-utils
+ ${project.version}
+ test-jar
+ test
+
+
+
+ ${project.groupId}
+ gatk-engine${project.version}test-jartest
@@ -63,16 +71,6 @@
-
- org.apache.maven.plugins
- maven-resources-plugin
-
-
- copy-resource-bundle-log4j
- prepare-package
-
-
- org.apache.maven.pluginsmaven-javadoc-plugin
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java
deleted file mode 100644
index bf52849d6..000000000
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
-* By downloading the PROGRAM you agree to the following terms of use:
-*
-* BROAD INSTITUTE
-* SOFTWARE LICENSE AGREEMENT
-* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
-*
-* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
-*
-* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
-* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
-*
-* 1. DEFINITIONS
-* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
-*
-* 2. LICENSE
-* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
-* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
-* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
-*
-* 3. PHONE-HOME FEATURE
-* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
-*
-* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
-* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
-* Copyright 2012-2014 Broad Institute, Inc.
-* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
-*
-* 5. INDEMNIFICATION
-* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
-*
-* 6. NO REPRESENTATIONS OR WARRANTIES
-* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
-*
-* 7. ASSIGNMENT
-* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
-*
-* 8. MISCELLANEOUS
-* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
-* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
-* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
-* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
-* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
-* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
-* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
-*/
-
-package org.broadinstitute.gatk.engine.arguments;
-
-import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorImplementation;
-import org.broadinstitute.gatk.utils.commandline.*;
-import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingOutputMode;
-import org.broadinstitute.gatk.tools.walkers.genotyper.OutputMode;
-import org.broadinstitute.gatk.utils.collections.DefaultHashMap;
-import htsjdk.variant.variantcontext.VariantContext;
-
-import java.io.File;
-import java.lang.reflect.Field;
-import java.lang.reflect.Method;
-import java.lang.reflect.Modifier;
-import java.util.Collections;
-import java.util.Map;
-
-/**
- * Created with IntelliJ IDEA.
- * User: rpoplin
- * Date: 8/20/12
- * A collection of arguments that are common to the various callers.
- * This is pulled out so that every caller isn't exposed to the arguments from every other caller.
- */
-
-public class StandardCallerArgumentCollection implements Cloneable {
-
- @ArgumentCollection
- public GenotypeCalculationArgumentCollection genotypeArgs = new GenotypeCalculationArgumentCollection();
-
- @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false)
- public GenotypingOutputMode genotypingOutputMode = GenotypingOutputMode.DISCOVERY;
-
- /**
- * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding
- */
- @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false)
- public RodBinding alleles;
-
- /**
- * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
- * Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we
- * will try to remove (N * contamination fraction) bases for each alternate allele.
- */
- @Argument(fullName = "contamination_fraction_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false)
- public double CONTAMINATION_FRACTION = DEFAULT_CONTAMINATION_FRACTION;
- public static final double DEFAULT_CONTAMINATION_FRACTION = 0.0;
-
- /**
- * This argument specifies a file with two columns "sample" and "contamination" specifying the contamination level for those samples.
- * Samples that do not appear in this file will be processed with CONTAMINATION_FRACTION.
- **/
- @Advanced
- @Argument(fullName = "contamination_fraction_per_sample_file", shortName = "contaminationFile", doc = "Tab-separated File containing fraction of contamination in sequencing data (per sample) to aggressively remove. Format should be \"\" (Contamination is double) per line; No header.", required = false)
- public File CONTAMINATION_FRACTION_FILE = null;
-
- /**
- * Indicates whether there is some sample contamination present.
- */
- private boolean sampleContaminationWasLoaded = false;
-
- /**
- *
- * @return an _Immutable_ copy of the Sample-Contamination Map, defaulting to CONTAMINATION_FRACTION so that if the sample isn't in the map map(sample)==CONTAMINATION_FRACTION
- */
- public Map getSampleContamination(){
- //make sure that the default value is set up right
- sampleContamination.setDefaultValue(CONTAMINATION_FRACTION);
- if (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0)
- sampleContaminationWasLoaded = true;
- return Collections.unmodifiableMap(sampleContamination);
- }
-
- public void setSampleContamination(DefaultHashMap sampleContamination) {
- this.sampleContamination.clear();
- this.sampleContaminationWasLoaded = !Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0;
- if (!sampleContaminationWasLoaded)
- for (final Double d : sampleContamination.values())
- if (!Double.isNaN(d) && d > 0.0) {
- sampleContaminationWasLoaded = true;
- break;
- }
- this.sampleContamination.putAll(sampleContamination);
- this.sampleContamination.setDefaultValue(CONTAMINATION_FRACTION);
- }
-
- /**
- * Returns true if there is some sample contamination present, false otherwise.
- * @return {@code true} iff there is some sample contamination
- */
- public boolean isSampleContaminationPresent() {
- return (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0) || sampleContaminationWasLoaded;
- }
-
- //Needs to be here because it uses CONTAMINATION_FRACTION
- private DefaultHashMap sampleContamination = new DefaultHashMap(CONTAMINATION_FRACTION);
-
- /**
- * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
- */
- @Hidden
- @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
- public AFCalculatorImplementation requestedAlleleFrequencyCalculationModel;
-
- @Hidden
- @Argument(shortName = "logExactCalls", doc="x", required=false)
- public File exactCallsLog = null;
-
- @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
- public OutputMode outputMode = OutputMode.EMIT_VARIANTS_ONLY;
-
- /**
- * Advanced, experimental argument: if SNP likelihood model is specified, and if EMIT_ALL_SITES output mode is set, when we set this argument then we will also emit PLs at all sites.
- * This will give a measure of reference confidence and a measure of which alt alleles are more plausible (if any).
- * WARNINGS:
- * - This feature will inflate VCF file size considerably.
- * - All SNP ALT alleles will be emitted with corresponding 10 PL values.
- * - An error will be emitted if EMIT_ALL_SITES is not set, or if anything other than diploid SNP model is used
- */
- @Advanced
- @Argument(fullName = "allSitePLs", shortName = "allSitePLs", doc = "Annotate all sites with PLs", required = false)
- public boolean annotateAllSitesWithPLs = false;
-
- /**
- * Creates a Standard caller argument collection with default values.
- */
- public StandardCallerArgumentCollection() { }
-
- /**
- * "Casts" a caller argument collection into another type.
- *
- *
Common fields values are copied across
- * @param clazz the class of the result.
- * @param result argument collection class.
- * @return never {@code null}.
- */
- public T cloneTo(final Class clazz) {
- // short cut: just use regular clone if it happens to be the same class.
- if (clazz == getClass())
- return (T) clone();
- try {
- final T result = clazz.newInstance();
- for (final Field field : getClass().getFields()) {
- // just copy common fields.
- if (!field.getDeclaringClass().isAssignableFrom(clazz))
- continue;
- final int fieldModifiers = field.getModifiers();
- if ((fieldModifiers & UNCOPYABLE_MODIFIER_MASK) != 0) continue;
- //Use the clone() method if appropriate
- if (Cloneable.class.isAssignableFrom(field.getType())) {
- Method clone = field.getType().getMethod("clone");
- field.set(result, clone.invoke(field.get(this)));
- } else
- field.set(result,field.get(this));
- }
- return result;
- } catch (final Exception ex) {
- throw new IllegalStateException(ex);
- }
- }
-
- /**
- * Creates a copy of this configuration.
- * @return never {@code null}.
- */
- @Override
- public StandardCallerArgumentCollection clone() {
- try {
- StandardCallerArgumentCollection cloned = (StandardCallerArgumentCollection) super.clone();
- cloned.genotypeArgs = genotypeArgs.clone();
- return cloned;
- } catch (CloneNotSupportedException e) {
- throw new IllegalStateException("unreachable code");
- }
- }
-
- /**
- * Holds a modifiers mask that identifies those fields that cannot be copied between
- * StandardCallerArgumentCollections.
- */
- private final int UNCOPYABLE_MODIFIER_MASK = Modifier.PRIVATE | Modifier.STATIC | Modifier.FINAL;
-}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRGatherer.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRGatherer.java
new file mode 100644
index 000000000..9ad2282ea
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRGatherer.java
@@ -0,0 +1,138 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2014 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.engine.recalibration;
+
+import org.apache.commons.collections.CollectionUtils;
+import org.apache.log4j.Logger;
+import org.broadinstitute.gatk.utils.commandline.Gatherer;
+import org.broadinstitute.gatk.utils.report.GATKReport;
+import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
+import org.broadinstitute.gatk.utils.exceptions.UserException;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.PrintStream;
+import java.util.*;
+
+/**
+ * User: carneiro
+ * Date: 3/29/11
+ */
+
+
+public class BQSRGatherer extends Gatherer {
+
+ private static final Logger logger = Logger.getLogger(BQSRGatherer.class);
+ private static final String EMPTY_INPUT_LIST = "list of inputs files is empty or there is no usable data in any input file";
+ private static final String MISSING_OUTPUT_FILE = "missing output file name";
+ private static final String MISSING_READ_GROUPS = "Missing read group(s)";
+
+ @Override
+ public void gather(final List inputs, final File output) {
+ final PrintStream outputFile;
+ try {
+ outputFile = new PrintStream(output);
+ } catch(FileNotFoundException e) {
+ throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE);
+ }
+ final GATKReport report = gatherReport(inputs);
+ report.print(outputFile);
+ }
+
+ /**
+ * Gathers the input recalibration reports into a single report.
+ *
+ * @param inputs Input recalibration GATK reports
+ * @return gathered recalibration GATK report
+ */
+ public static GATKReport gatherReport(final List inputs) {
+ final SortedSet allReadGroups = new TreeSet();
+ final LinkedHashMap> inputReadGroups = new LinkedHashMap>();
+
+ // Get the read groups from each input report
+ for (final File input : inputs) {
+ final Set readGroups = RecalibrationReport.getReadGroups(input);
+ inputReadGroups.put(input, readGroups);
+ allReadGroups.addAll(readGroups);
+ }
+
+ // Log the read groups that are missing from specific inputs
+ for (Map.Entry> entry: inputReadGroups.entrySet()) {
+ final File input = entry.getKey();
+ final Set readGroups = entry.getValue();
+ if (allReadGroups.size() != readGroups.size()) {
+ // Since this is not completely unexpected, more than debug, but less than a proper warning.
+ logger.info(MISSING_READ_GROUPS + ": " + input.getAbsolutePath());
+ for (final Object readGroup: CollectionUtils.subtract(allReadGroups, readGroups)) {
+ logger.info(" " + readGroup);
+ }
+ }
+ }
+
+ RecalibrationReport generalReport = null;
+ for (File input : inputs) {
+ final RecalibrationReport inputReport = new RecalibrationReport(input, allReadGroups);
+ if( inputReport.isEmpty() ) { continue; }
+
+ if (generalReport == null)
+ generalReport = inputReport;
+ else
+ generalReport.combine(inputReport);
+ }
+ if (generalReport == null)
+ throw new ReviewedGATKException(EMPTY_INPUT_LIST);
+
+ generalReport.calculateQuantizedQualities();
+
+ return generalReport.createGATKReport();
+ }
+}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRReadTransformer.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRReadTransformer.java
new file mode 100644
index 000000000..b524ad08a
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRReadTransformer.java
@@ -0,0 +1,104 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2014 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.engine.recalibration;
+
+import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
+import org.broadinstitute.gatk.engine.WalkerManager;
+import org.broadinstitute.gatk.engine.iterators.ReadTransformer;
+import org.broadinstitute.gatk.engine.walkers.Walker;
+import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
+
+/**
+ * A ReadTransformer that applies BQSR on the fly to reads
+ *
+ * User: rpoplin
+ * Date: 2/13/12
+ */
+public class BQSRReadTransformer extends ReadTransformer {
+ private boolean enabled;
+ private BaseRecalibration bqsr = null;
+
+ @Override
+ public OrderingConstraint getOrderingConstraint() { return OrderingConstraint.MUST_BE_FIRST; }
+
+ @Override
+ public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) {
+ this.enabled = engine.hasBQSRArgumentSet();
+ if ( enabled ) {
+ // TODO -- See important note below about applying BQSR to a reduced BAM file:
+ // If it is important to make sure that BQSR is not applied (as opposed to having the covariates computed) against a reduced bam file,
+ // we need to figure out how to make this work. The problem is that the ReadTransformers are initialized before the ReadDataSource
+ // inside the GenomeAnalysisEngine, so we generate a NPE when trying to retrieve the SAMFileHeaders. Ultimately, I don't think this is
+ // a necessary check anyways since we disallow running BaseRecalibrator on reduced bams (so we can't generate the recal tables to use here).
+ // Although we could add this check to the apply() method below, it's kind of ugly and inefficient.
+ // The call here would be: RecalUtils.checkForInvalidRecalBams(engine.getSAMFileHeaders(), engine.getArguments().ALLOW_BQSR_ON_REDUCED_BAMS);
+ final BQSRArgumentSet args = engine.getBQSRArgumentSet();
+ this.bqsr = new BaseRecalibration(args.getRecalFile(), args.getQuantizationLevels(), args.shouldDisableIndelQuals(), args.getPreserveQscoresLessThan(), args.shouldEmitOriginalQuals(), args.getGlobalQScorePrior());
+ }
+ final BQSRMode mode = WalkerManager.getWalkerAnnotation(walker, BQSRMode.class);
+ return mode.ApplicationTime();
+ }
+
+ @Override
+ public boolean enabled() {
+ return enabled;
+ }
+
+ /**
+ * initialize a new BQSRReadTransformer that applies BQSR on the fly to incoming reads.
+ */
+ @Override
+ public GATKSAMRecord apply(GATKSAMRecord read) {
+ bqsr.recalibrateRead(read);
+ return read;
+ }
+}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BaseRecalibration.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BaseRecalibration.java
new file mode 100644
index 000000000..9095f695e
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BaseRecalibration.java
@@ -0,0 +1,208 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2014 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.engine.recalibration;
+
+import com.google.java.contract.Ensures;
+import htsjdk.samtools.SAMTag;
+import htsjdk.samtools.SAMUtils;
+import org.apache.log4j.Logger;
+import org.broadinstitute.gatk.utils.MathUtils;
+import org.broadinstitute.gatk.utils.QualityUtils;
+import org.broadinstitute.gatk.utils.exceptions.UserException;
+import org.broadinstitute.gatk.utils.recalibration.EventType;
+import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate;
+import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Utility methods to facilitate on-the-fly base quality score recalibration.
+ *
+ * User: carneiro and rpoplin
+ * Date: 2/4/12
+ */
+
+public class BaseRecalibration {
+ private static Logger logger = Logger.getLogger(BaseRecalibration.class);
+ private final static boolean TEST_CACHING = false;
+
+ private final QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done)
+ private final RecalibrationTables recalibrationTables;
+ private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation
+
+ private final boolean disableIndelQuals;
+ private final int preserveQLessThan;
+ private final double globalQScorePrior;
+ private final boolean emitOriginalQuals;
+
+ /**
+ * Constructor using a GATK Report file
+ *
+ * @param RECAL_FILE a GATK Report file containing the recalibration information
+ * @param quantizationLevels number of bins to quantize the quality scores
+ * @param disableIndelQuals if true, do not emit base indel qualities
+ * @param preserveQLessThan preserve quality scores less than this value
+ */
+ public BaseRecalibration(final File RECAL_FILE, final int quantizationLevels, final boolean disableIndelQuals, final int preserveQLessThan, final boolean emitOriginalQuals, final double globalQScorePrior) {
+ RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE);
+
+ recalibrationTables = recalibrationReport.getRecalibrationTables();
+ requestedCovariates = recalibrationReport.getRequestedCovariates();
+ quantizationInfo = recalibrationReport.getQuantizationInfo();
+ if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores
+ quantizationInfo.noQuantization();
+ else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wants to use what's in the report.
+ quantizationInfo.quantizeQualityScores(quantizationLevels);
+
+ this.disableIndelQuals = disableIndelQuals;
+ this.preserveQLessThan = preserveQLessThan;
+ this.globalQScorePrior = globalQScorePrior;
+ this.emitOriginalQuals = emitOriginalQuals;
+ }
+
+ /**
+ * Recalibrates the base qualities of a read
+ *
+ * It updates the base qualities of the read with the new recalibrated qualities (for all event types)
+ *
+ * Implements a serial recalibration of the reads using the combinational table.
+ * First, we perform a positional recalibration, and then a subsequent dinuc correction.
+ *
+ * Given the full recalibration table, we perform the following preprocessing steps:
+ *
+ * - calculate the global quality score shift across all data [DeltaQ]
+ * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
+ * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
+ * - The final shift equation is:
+ *
+ * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
+ *
+ * @param read the read to recalibrate
+ */
+ public void recalibrateRead(final GATKSAMRecord read) {
+ if (emitOriginalQuals && read.getAttribute(SAMTag.OQ.name()) == null) { // Save the old qualities if the tag isn't already taken in the read
+ try {
+ read.setAttribute(SAMTag.OQ.name(), SAMUtils.phredToFastq(read.getBaseQualities()));
+ } catch (IllegalArgumentException e) {
+ throw new UserException.MalformedBAM(read, "illegal base quality encountered; " + e.getMessage());
+ }
+ }
+
+ final ReadCovariates readCovariates = RecalUtils.computeCovariates(read, requestedCovariates);
+ final int readLength = read.getReadLength();
+
+ for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings
+ if (disableIndelQuals && errorModel != EventType.BASE_SUBSTITUTION) {
+ read.setBaseQualities(null, errorModel);
+ continue;
+ }
+
+ final byte[] quals = read.getBaseQualities(errorModel);
+
+ // get the keyset for this base using the error model
+ final int[][] fullReadKeySet = readCovariates.getKeySet(errorModel);
+
+ // the rg key is constant over the whole read, the global deltaQ is too
+ final int rgKey = fullReadKeySet[0][0];
+ final RecalDatum empiricalQualRG = recalibrationTables.getReadGroupTable().get(rgKey, errorModel.ordinal());
+
+ if( empiricalQualRG != null ) {
+ final double epsilon = ( globalQScorePrior > 0.0 && errorModel.equals(EventType.BASE_SUBSTITUTION) ? globalQScorePrior : empiricalQualRG.getEstimatedQReported() );
+
+ for (int offset = 0; offset < readLength; offset++) { // recalibrate all bases in the read
+ final byte origQual = quals[offset];
+
+ // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality)
+ if ( origQual >= preserveQLessThan ) {
+ // get the keyset for this base using the error model
+ final int[] keySet = fullReadKeySet[offset];
+ final RecalDatum empiricalQualQS = recalibrationTables.getQualityScoreTable().get(keySet[0], keySet[1], errorModel.ordinal());
+ final List empiricalQualCovs = new ArrayList();
+ for (int i = 2; i < requestedCovariates.length; i++) {
+ if (keySet[i] < 0) {
+ continue;
+ }
+ empiricalQualCovs.add(recalibrationTables.getTable(i).get(keySet[0], keySet[1], keySet[i], errorModel.ordinal()));
+ }
+
+ double recalibratedQualDouble = hierarchicalBayesianQualityEstimate( epsilon, empiricalQualRG, empiricalQualQS, empiricalQualCovs );
+
+ // recalibrated quality is bound between 1 and MAX_QUAL
+ final byte recalibratedQual = QualityUtils.boundQual(MathUtils.fastRound(recalibratedQualDouble), RecalDatum.MAX_RECALIBRATED_Q_SCORE);
+
+ // return the quantized version of the recalibrated quality
+ final byte recalibratedQualityScore = quantizationInfo.getQuantizedQuals().get(recalibratedQual);
+
+ quals[offset] = recalibratedQualityScore;
+ }
+ }
+ }
+
+ // finally update the base qualities in the read
+ read.setBaseQualities(quals, errorModel);
+ }
+ }
+
+ @Ensures("result > 0.0")
+ protected static double hierarchicalBayesianQualityEstimate( final double epsilon, final RecalDatum empiricalQualRG, final RecalDatum empiricalQualQS, final List empiricalQualCovs ) {
+ final double globalDeltaQ = ( empiricalQualRG == null ? 0.0 : empiricalQualRG.getEmpiricalQuality(epsilon) - epsilon );
+ final double deltaQReported = ( empiricalQualQS == null ? 0.0 : empiricalQualQS.getEmpiricalQuality(globalDeltaQ + epsilon) - (globalDeltaQ + epsilon) );
+ double deltaQCovariates = 0.0;
+ for( final RecalDatum empiricalQualCov : empiricalQualCovs ) {
+ deltaQCovariates += ( empiricalQualCov == null ? 0.0 : empiricalQualCov.getEmpiricalQuality(deltaQReported + globalDeltaQ + epsilon) - (deltaQReported + globalDeltaQ + epsilon) );
+ }
+
+ return epsilon + globalDeltaQ + deltaQReported + deltaQCovariates;
+ }
+}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QualQuantizer.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QualQuantizer.java
new file mode 100644
index 000000000..b01359fca
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QualQuantizer.java
@@ -0,0 +1,500 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2014 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.engine.recalibration;
+
+import com.google.java.contract.Ensures;
+import com.google.java.contract.Invariant;
+import com.google.java.contract.Requires;
+import org.apache.log4j.Logger;
+import org.broadinstitute.gatk.utils.report.GATKReport;
+import org.broadinstitute.gatk.utils.report.GATKReportTable;
+import org.broadinstitute.gatk.utils.QualityUtils;
+import org.broadinstitute.gatk.utils.Utils;
+import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
+
+import java.io.PrintStream;
+import java.util.*;
+
+/**
+ * A general algorithm for quantizing quality score distributions to use a specific number of levels
+ *
+ * Takes a histogram of quality scores and a desired number of levels and produces a
+ * map from original quality scores -> quantized quality scores.
+ *
+ * Note that this data structure is fairly heavy-weight, holding lots of debugging and
+ * calculation information. If you want to use it efficiently at scale with lots of
+ * read groups the right way to do this:
+ *
+ * Map> map
+ * for each read group rg:
+ * hist = getQualHist(rg)
+ * QualQuantizer qq = new QualQuantizer(hist, nLevels, minInterestingQual)
+ * map.set(rg, qq.getOriginalToQuantizedMap())
+ *
+ * This map would then be used to look up the appropriate original -> quantized
+ * quals for each read as it comes in.
+ *
+ * @author Mark Depristo
+ * @since 3/2/12
+ */
+public class QualQuantizer {
+ final private static Set MY_EMPTY_SET = Collections.emptySet();
+
+ private static Logger logger = Logger.getLogger(QualQuantizer.class);
+
+ /**
+ * Inputs to the QualQuantizer
+ */
+ final int nLevels, minInterestingQual;
+ final List nObservationsPerQual;
+
+ /**
+ * Map from original qual (e.g., Q30) to new quantized qual (e.g., Q28).
+ *
+ * Has the same range as nObservationsPerQual
+ */
+ final List originalToQuantizedMap;
+
+ /** Sorted set of qual intervals.
+ *
+ * After quantize() this data structure contains only the top-level qual intervals
+ */
+ final TreeSet quantizedIntervals;
+
+ /**
+ * Protected creator for testng use only
+ */
+ protected QualQuantizer(final int minInterestingQual) {
+ this.nObservationsPerQual = Collections.emptyList();
+ this.nLevels = 0;
+ this.minInterestingQual = minInterestingQual;
+ this.quantizedIntervals = null;
+ this.originalToQuantizedMap = null;
+ }
+
+ /**
+ * Creates a QualQuantizer for the histogram that has nLevels
+ *
+ * Note this is the only interface to the system. After creating this object
+ * the map can be obtained via getOriginalToQuantizedMap()
+ *
+ * @param nObservationsPerQual A histogram of counts of bases with quality scores. Note that
+ * this histogram must start at 0 (i.e., get(0) => count of Q0 bases) and must include counts all the
+ * way up to the largest quality score possible in the reads. OK if the histogram includes many 0
+ * count bins, as these are quantized for free.
+ * @param nLevels the desired number of distinct quality scores to represent the full original range. Must
+ * be at least 1.
+ * @param minInterestingQual All quality scores <= this value are considered uninteresting and are freely
+ * merged together. For example, if this value is 10, then Q0-Q10 are all considered free to merge, and
+ * quantized into a single value. For ILMN data with lots of Q2 bases this results in a Q2 bin containing
+ * all data with Q0-Q10.
+ */
+ public QualQuantizer(final List nObservationsPerQual, final int nLevels, final int minInterestingQual) {
+ this.nObservationsPerQual = nObservationsPerQual;
+ this.nLevels = nLevels;
+ this.minInterestingQual = minInterestingQual;
+
+ // some sanity checking
+ if ( Collections.min(nObservationsPerQual) < 0 ) throw new ReviewedGATKException("Quality score histogram has negative values at: " + Utils.join(", ", nObservationsPerQual));
+ if ( nLevels < 0 ) throw new ReviewedGATKException("nLevels must be >= 0");
+ if ( minInterestingQual < 0 ) throw new ReviewedGATKException("minInterestingQual must be >= 0");
+
+ // actually run the quantizer
+ this.quantizedIntervals = quantize();
+
+ // store the map
+ this.originalToQuantizedMap = intervalsToMap(quantizedIntervals);
+ }
+
+ /**
+ * Represents an contiguous interval of quality scores.
+ *
+ * qStart and qEnd are inclusive, so qStart = qEnd = 2 is the quality score bin of 2
+ */
+ @Invariant({
+ "qStart <= qEnd",
+ "qStart >= 0",
+ "qEnd <= 1000",
+ "nObservations >= 0",
+ "nErrors >= 0",
+ "nErrors <= nObservations",
+ "fixedQual >= -1 && fixedQual <= QualityUtils.MAX_SAM_QUAL_SCORE",
+ "mergeOrder >= 0"})
+ protected final class QualInterval implements Comparable {
+ final int qStart, qEnd, fixedQual, level;
+ final long nObservations, nErrors;
+ final Set subIntervals;
+
+ /** for debugging / visualization. When was this interval created? */
+ int mergeOrder;
+
+ protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level) {
+ this(qStart, qEnd, nObservations, nErrors, level, -1, MY_EMPTY_SET);
+ }
+
+ protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final Set subIntervals) {
+ this(qStart, qEnd, nObservations, nErrors, level, -1, subIntervals);
+ }
+
+ protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual) {
+ this(qStart, qEnd, nObservations, nErrors, level, fixedQual, MY_EMPTY_SET);
+ }
+
+ @Requires("level >= 0")
+ public QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual, final Set subIntervals) {
+ this.qStart = qStart;
+ this.qEnd = qEnd;
+ this.nObservations = nObservations;
+ this.nErrors = nErrors;
+ this.fixedQual = fixedQual;
+ this.level = level;
+ this.mergeOrder = 0;
+ this.subIntervals = Collections.unmodifiableSet(subIntervals);
+ }
+
+ /**
+ * @return Human readable name of this interval: e.g., 10-12
+ */
+ public String getName() {
+ return qStart + "-" + qEnd;
+ }
+
+ @Override
+ public String toString() {
+ return "QQ:" + getName();
+ }
+
+ /**
+ * @return the error rate (in real space) of this interval, or 0 if there are no observations
+ */
+ @Ensures("result >= 0.0")
+ public double getErrorRate() {
+ if ( hasFixedQual() )
+ return QualityUtils.qualToErrorProb((byte)fixedQual);
+ else if ( nObservations == 0 )
+ return 0.0;
+ else
+ return (nErrors+1) / (1.0 * (nObservations+1));
+ }
+
+ /**
+ * @return the QUAL of the error rate of this interval, or the fixed qual if this interval was created with a fixed qual.
+ */
+ @Ensures("result >= 0 && result <= QualityUtils.MAX_SAM_QUAL_SCORE")
+ public byte getQual() {
+ if ( ! hasFixedQual() )
+ return QualityUtils.errorProbToQual(getErrorRate());
+ else
+ return (byte)fixedQual;
+ }
+
+ /**
+ * @return true if this bin is using a fixed qual
+ */
+ public boolean hasFixedQual() {
+ return fixedQual != -1;
+ }
+
+ @Override
+ public int compareTo(final QualInterval qualInterval) {
+ return Integer.valueOf(this.qStart).compareTo(qualInterval.qStart);
+ }
+
+ /**
+ * Create a interval representing the merge of this interval and toMerge
+ *
+ * Errors and observations are combined
+ * Subintervals updated in order of left to right (determined by qStart)
+ * Level is 1 + highest level of this and toMerge
+ * Order must be updated elsewhere
+ *
+ * @param toMerge
+ * @return newly created merged QualInterval
+ */
+ @Requires({"toMerge != null"})
+ @Ensures({
+ "result != null",
+ "result.nObservations >= this.nObservations",
+ "result.nObservations >= toMerge.nObservations",
+ "result.nErrors >= this.nErrors",
+ "result.nErrors >= toMerge.nErrors",
+ "result.qStart == Math.min(this.qStart, toMerge.qStart)",
+ "result.qEnd == Math.max(this.qEnd, toMerge.qEnd)",
+ "result.level > Math.max(this.level, toMerge.level)",
+ "result.subIntervals.size() == 2"
+ })
+ public QualInterval merge(final QualInterval toMerge) {
+ final QualInterval left = this.compareTo(toMerge) < 0 ? this : toMerge;
+ final QualInterval right = this.compareTo(toMerge) < 0 ? toMerge : this;
+
+ if ( left.qEnd + 1 != right.qStart )
+ throw new ReviewedGATKException("Attempting to merge non-contiguous intervals: left = " + left + " right = " + right);
+
+ final long nCombinedObs = left.nObservations + right.nObservations;
+ final long nCombinedErr = left.nErrors + right.nErrors;
+
+ final int level = Math.max(left.level, right.level) + 1;
+ final Set subIntervals = new HashSet(Arrays.asList(left, right));
+ QualInterval merged = new QualInterval(left.qStart, right.qEnd, nCombinedObs, nCombinedErr, level, subIntervals);
+
+ return merged;
+ }
+
+ public double getPenalty() {
+ return calcPenalty(getErrorRate());
+ }
+
+
+ /**
+ * Calculate the penalty of this interval, given the overall error rate for the interval
+ *
+ * If the globalErrorRate is e, this value is:
+ *
+ * sum_i |log10(e_i) - log10(e)| * nObservations_i
+ *
+ * each the index i applies to all leaves of the tree accessible from this interval
+ * (found recursively from subIntervals as necessary)
+ *
+ * @param globalErrorRate overall error rate in real space against which we calculate the penalty
+ * @return the cost of approximating the bins in this interval with the globalErrorRate
+ */
+ @Requires("globalErrorRate >= 0.0")
+ @Ensures("result >= 0.0")
+ private double calcPenalty(final double globalErrorRate) {
+ if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty
+ return 0.0;
+
+ if ( subIntervals.isEmpty() ) {
+ // this is leave node
+ if ( this.qEnd <= minInterestingQual )
+ // It's free to merge up quality scores below the smallest interesting one
+ return 0;
+ else {
+ return (Math.abs(Math.log10(getErrorRate()) - Math.log10(globalErrorRate))) * nObservations;
+ }
+ } else {
+ double sum = 0;
+ for ( final QualInterval interval : subIntervals )
+ sum += interval.calcPenalty(globalErrorRate);
+ return sum;
+ }
+ }
+ }
+
+ /**
+ * Main method for computing the quantization intervals.
+ *
+ * Invoked in the constructor after all input variables are initialized. Walks
+ * over the inputs and builds the min. penalty forest of intervals with exactly nLevel
+ * root nodes. Finds this min. penalty forest via greedy search, so is not guarenteed
+ * to find the optimal combination.
+ *
+ * TODO: develop a smarter algorithm
+ *
+ * @return the forest of intervals with size == nLevels
+ */
+ @Ensures({"! result.isEmpty()", "result.size() == nLevels"})
+ private TreeSet quantize() {
+ // create intervals for each qual individually
+ final TreeSet intervals = new TreeSet();
+ for ( int qStart = 0; qStart < getNQualsInHistogram(); qStart++ ) {
+ final long nObs = nObservationsPerQual.get(qStart);
+ final double errorRate = QualityUtils.qualToErrorProb((byte)qStart);
+ final double nErrors = nObs * errorRate;
+ final QualInterval qi = new QualInterval(qStart, qStart, nObs, (int)Math.floor(nErrors), 0, (byte)qStart);
+ intervals.add(qi);
+ }
+
+ // greedy algorithm:
+ // while ( n intervals >= nLevels ):
+ // find intervals to merge with least penalty
+ // merge it
+ while ( intervals.size() > nLevels ) {
+ mergeLowestPenaltyIntervals(intervals);
+ }
+
+ return intervals;
+ }
+
+ /**
+ * Helper function that finds and merges together the lowest penalty pair of intervals
+ * @param intervals
+ */
+ @Requires("! intervals.isEmpty()")
+ private void mergeLowestPenaltyIntervals(final TreeSet intervals) {
+ // setup the iterators
+ final Iterator it1 = intervals.iterator();
+ final Iterator it1p = intervals.iterator();
+ it1p.next(); // skip one
+
+ // walk over the pairs of left and right, keeping track of the pair with the lowest merge penalty
+ QualInterval minMerge = null;
+ if ( logger.isDebugEnabled() ) logger.debug("mergeLowestPenaltyIntervals: " + intervals.size());
+ int lastMergeOrder = 0;
+ while ( it1p.hasNext() ) {
+ final QualInterval left = it1.next();
+ final QualInterval right = it1p.next();
+ final QualInterval merged = left.merge(right);
+ lastMergeOrder = Math.max(Math.max(lastMergeOrder, left.mergeOrder), right.mergeOrder);
+ if ( minMerge == null || (merged.getPenalty() < minMerge.getPenalty() ) ) {
+ if ( logger.isDebugEnabled() ) logger.debug(" Updating merge " + minMerge);
+ minMerge = merged;
+ }
+ }
+
+ // now actually go ahead and merge the minMerge pair
+ if ( logger.isDebugEnabled() ) logger.debug(" => final min merge " + minMerge);
+ intervals.removeAll(minMerge.subIntervals);
+ intervals.add(minMerge);
+ minMerge.mergeOrder = lastMergeOrder + 1;
+ if ( logger.isDebugEnabled() ) logger.debug("updated intervals: " + intervals);
+ }
+
+ /**
+ * Given a final forest of intervals constructs a list mapping
+ * list.get(i) => quantized qual to use for original quality score i
+ *
+ * This function should be called only once to initialize the corresponding
+ * cached value in this object, as the calculation is a bit costly.
+ *
+ * @param intervals
+ * @return
+ */
+ @Ensures("result.size() == getNQualsInHistogram()")
+ private List intervalsToMap(final TreeSet intervals) {
+ final List map = new ArrayList(getNQualsInHistogram());
+ map.addAll(Collections.nCopies(getNQualsInHistogram(), Byte.MIN_VALUE));
+ for ( final QualInterval interval : intervals ) {
+ for ( int q = interval.qStart; q <= interval.qEnd; q++ ) {
+ map.set(q, interval.getQual());
+ }
+ }
+
+ if ( Collections.min(map) == Byte.MIN_VALUE )
+ throw new ReviewedGATKException("quantized quality score map contains an un-initialized value");
+
+ return map;
+ }
+
+ @Ensures("result > 0")
+ private final int getNQualsInHistogram() {
+ return nObservationsPerQual.size();
+ }
+
+ /**
+ * Write out a GATKReport to visualize the QualQuantization process of this data
+ * @param out
+ */
+ public void writeReport(PrintStream out) {
+ final GATKReport report = new GATKReport();
+
+ addQualHistogramToReport(report);
+ addIntervalsToReport(report);
+
+ report.print(out);
+ }
+
+ private final void addQualHistogramToReport(final GATKReport report) {
+ report.addTable("QualHistogram", "Quality score histogram provided to report", 2);
+ GATKReportTable table = report.getTable("QualHistogram");
+
+ table.addColumn("qual");
+ table.addColumn("count");
+
+ for ( int q = 0; q < nObservationsPerQual.size(); q++ ) {
+ table.set(q, "qual", q);
+ table.set(q, "count", nObservationsPerQual.get(q));
+ }
+ }
+
+
+ private final void addIntervalsToReport(final GATKReport report) {
+ report.addTable("QualQuantizerIntervals", "Table of QualQuantizer quantization intervals", 10);
+ GATKReportTable table = report.getTable("QualQuantizerIntervals");
+
+ table.addColumn("name");
+ table.addColumn("qStart");
+ table.addColumn("qEnd");
+ table.addColumn("level");
+ table.addColumn("merge.order");
+ table.addColumn("nErrors");
+ table.addColumn("nObservations");
+ table.addColumn("qual");
+ table.addColumn("penalty");
+ table.addColumn("root.node");
+ //table.addColumn("subintervals", "NA");
+
+ for ( QualInterval interval : quantizedIntervals )
+ addIntervalToReport(table, interval, true);
+ }
+
+ private final void addIntervalToReport(final GATKReportTable table, final QualInterval interval, final boolean atRootP) {
+ final String name = interval.getName();
+ table.set(name, "name", name);
+ table.set(name, "qStart", interval.qStart);
+ table.set(name, "qEnd", interval.qEnd);
+ table.set(name, "level", interval.level);
+ table.set(name, "merge.order", interval.mergeOrder);
+ table.set(name, "nErrors", interval.nErrors);
+ table.set(name, "nObservations", interval.nObservations);
+ table.set(name, "qual", interval.getQual());
+ table.set(name, "penalty", String.format("%.1f", interval.getPenalty()));
+ table.set(name, "root.node", atRootP);
+
+ for ( final QualInterval sub : interval.subIntervals )
+ addIntervalToReport(table, sub, false);
+ }
+
+ public List getOriginalToQuantizedMap() {
+ return originalToQuantizedMap;
+ }
+}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QuantizationInfo.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QuantizationInfo.java
new file mode 100644
index 000000000..e054805af
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QuantizationInfo.java
@@ -0,0 +1,151 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2014 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.engine.recalibration;
+
+import org.broadinstitute.gatk.utils.report.GATKReportTable;
+import org.broadinstitute.gatk.utils.MathUtils;
+import org.broadinstitute.gatk.utils.QualityUtils;
+import org.broadinstitute.gatk.utils.collections.NestedIntegerArray;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Class that encapsulates the information necessary for quality score quantization for BQSR
+ *
+ * @author carneiro
+ * @since 3/26/12
+ */
+public class QuantizationInfo {
+ private List quantizedQuals;
+ private List empiricalQualCounts;
+ private int quantizationLevels;
+
+ private QuantizationInfo(List quantizedQuals, List empiricalQualCounts, int quantizationLevels) {
+ this.quantizedQuals = quantizedQuals;
+ this.empiricalQualCounts = empiricalQualCounts;
+ this.quantizationLevels = quantizationLevels;
+ }
+
+ public QuantizationInfo(List quantizedQuals, List empiricalQualCounts) {
+ this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals));
+ }
+
+ public QuantizationInfo(final RecalibrationTables recalibrationTables, final int quantizationLevels) {
+ final Long [] qualHistogram = new Long[QualityUtils.MAX_SAM_QUAL_SCORE +1]; // create a histogram with the empirical quality distribution
+ for (int i = 0; i < qualHistogram.length; i++)
+ qualHistogram[i] = 0L;
+
+ final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); // get the quality score table
+
+ for (final RecalDatum value : qualTable.getAllValues()) {
+ final RecalDatum datum = value;
+ final int empiricalQual = MathUtils.fastRound(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL )
+ qualHistogram[empiricalQual] += (long) datum.getNumObservations(); // add the number of observations for every key
+ }
+ empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities
+ quantizeQualityScores(quantizationLevels);
+
+ this.quantizationLevels = quantizationLevels;
+ }
+
+
+ public void quantizeQualityScores(int nLevels) {
+ QualQuantizer quantizer = new QualQuantizer(empiricalQualCounts, nLevels, QualityUtils.MIN_USABLE_Q_SCORE); // quantize the qualities to the desired number of levels
+ quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC)
+ }
+
+ public void noQuantization() {
+ this.quantizationLevels = QualityUtils.MAX_SAM_QUAL_SCORE;
+ for (int i = 0; i < this.quantizationLevels; i++)
+ quantizedQuals.set(i, (byte) i);
+ }
+
+ public List getQuantizedQuals() {
+ return quantizedQuals;
+ }
+
+ public int getQuantizationLevels() {
+ return quantizationLevels;
+ }
+
+ public GATKReportTable generateReportTable(boolean sortByCols) {
+ GATKReportTable quantizedTable;
+ if(sortByCols) {
+ quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3, GATKReportTable.TableSortingWay.SORT_BY_COLUMN);
+ } else {
+ quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3);
+ }
+ quantizedTable.addColumn(RecalUtils.QUALITY_SCORE_COLUMN_NAME);
+ quantizedTable.addColumn(RecalUtils.QUANTIZED_COUNT_COLUMN_NAME);
+ quantizedTable.addColumn(RecalUtils.QUANTIZED_VALUE_COLUMN_NAME);
+
+ for (int qual = 0; qual <= QualityUtils.MAX_SAM_QUAL_SCORE; qual++) {
+ quantizedTable.set(qual, RecalUtils.QUALITY_SCORE_COLUMN_NAME, qual);
+ quantizedTable.set(qual, RecalUtils.QUANTIZED_COUNT_COLUMN_NAME, empiricalQualCounts.get(qual));
+ quantizedTable.set(qual, RecalUtils.QUANTIZED_VALUE_COLUMN_NAME, quantizedQuals.get(qual));
+ }
+ return quantizedTable;
+ }
+
+ private static int calculateQuantizationLevels(List quantizedQuals) {
+ byte lastByte = -1;
+ int quantizationLevels = 0;
+ for (byte q : quantizedQuals) {
+ if (q != lastByte) {
+ quantizationLevels++;
+ lastByte = q;
+ }
+ }
+ return quantizationLevels;
+ }
+}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/ReadCovariates.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/ReadCovariates.java
new file mode 100644
index 000000000..c02dd4881
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/ReadCovariates.java
@@ -0,0 +1,176 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2014 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.engine.recalibration;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.gatk.utils.LRUCache;
+import org.broadinstitute.gatk.utils.recalibration.EventType;
+
+/**
+ * The object temporarily held by a read that describes all of it's covariates.
+ *
+ * In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap
+ *
+ * @author Mauricio Carneiro
+ * @since 2/8/12
+ */
+public class ReadCovariates {
+ private final static Logger logger = Logger.getLogger(ReadCovariates.class);
+
+ /**
+ * How big should we let the LRU cache grow
+ */
+ private static final int LRU_CACHE_SIZE = 500;
+
+ /**
+ * Use an LRU cache to keep cache of keys (int[][][]) arrays for each read length we've seen.
+ * The cache allows us to avoid the expense of recreating these arrays for every read. The LRU
+ * keeps the total number of cached arrays to less than LRU_CACHE_SIZE.
+ *
+ * This is a thread local variable, so the total memory required may grow to N_THREADS x LRU_CACHE_SIZE
+ */
+ private final static ThreadLocal> keysCache = new ThreadLocal>() {
+ @Override protected LRUCache initialValue() {
+ return new LRUCache(LRU_CACHE_SIZE);
+ }
+ };
+
+ /**
+ * The keys cache is only valid for a single covariate count. Normally this will remain constant for the analysis.
+ * If running multiple analyses (or the unit test suite), it's necessary to clear the cache.
+ */
+ public static void clearKeysCache() {
+ keysCache.remove();
+ }
+
+ /**
+ * Our keys, indexed by event type x read length x covariate
+ */
+ private final int[][][] keys;
+
+ /**
+ * The index of the current covariate, used by addCovariate
+ */
+ private int currentCovariateIndex = 0;
+
+ public ReadCovariates(final int readLength, final int numberOfCovariates) {
+ final LRUCache cache = keysCache.get();
+ final int[][][] cachedKeys = cache.get(readLength);
+ if ( cachedKeys == null ) {
+ // There's no cached value for read length so we need to create a new int[][][] array
+ if ( logger.isDebugEnabled() ) logger.debug("Keys cache miss for length " + readLength + " cache size " + cache.size());
+ keys = new int[EventType.values().length][readLength][numberOfCovariates];
+ cache.put(readLength, keys);
+ } else {
+ keys = cachedKeys;
+ }
+ }
+
+ public void setCovariateIndex(final int index) {
+ currentCovariateIndex = index;
+ }
+
+ /**
+ * Update the keys for mismatch, insertion, and deletion for the current covariate at read offset
+ *
+ * NOTE: no checks are performed on the number of covariates, for performance reasons. If the count increases
+ * after the keysCache has been accessed, this method will throw an ArrayIndexOutOfBoundsException. This currently
+ * only occurs in the testing harness, and we don't anticipate that it will become a part of normal runs.
+ *
+ * @param mismatch the mismatch key value
+ * @param insertion the insertion key value
+ * @param deletion the deletion key value
+ * @param readOffset the read offset, must be >= 0 and <= the read length used to create this ReadCovariates
+ */
+ public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) {
+ keys[EventType.BASE_SUBSTITUTION.ordinal()][readOffset][currentCovariateIndex] = mismatch;
+ keys[EventType.BASE_INSERTION.ordinal()][readOffset][currentCovariateIndex] = insertion;
+ keys[EventType.BASE_DELETION.ordinal()][readOffset][currentCovariateIndex] = deletion;
+ }
+
+ /**
+ * Get the keys for all covariates at read position for error model
+ *
+ * @param readPosition
+ * @param errorModel
+ * @return
+ */
+ public int[] getKeySet(final int readPosition, final EventType errorModel) {
+ return keys[errorModel.ordinal()][readPosition];
+ }
+
+ public int[][] getKeySet(final EventType errorModel) {
+ return keys[errorModel.ordinal()];
+ }
+
+ // ----------------------------------------------------------------------
+ //
+ // routines for testing
+ //
+ // ----------------------------------------------------------------------
+
+ protected int[][] getMismatchesKeySet() { return getKeySet(EventType.BASE_SUBSTITUTION); }
+ protected int[][] getInsertionsKeySet() { return getKeySet(EventType.BASE_INSERTION); }
+ protected int[][] getDeletionsKeySet() { return getKeySet(EventType.BASE_DELETION); }
+
+ protected int[] getMismatchesKeySet(final int readPosition) {
+ return getKeySet(readPosition, EventType.BASE_SUBSTITUTION);
+ }
+
+ protected int[] getInsertionsKeySet(final int readPosition) {
+ return getKeySet(readPosition, EventType.BASE_INSERTION);
+ }
+
+ protected int[] getDeletionsKeySet(final int readPosition) {
+ return getKeySet(readPosition, EventType.BASE_DELETION);
+ }
+}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatum.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatum.java
new file mode 100644
index 000000000..c92ef1773
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatum.java
@@ -0,0 +1,434 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2014 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.engine.recalibration;
+
+/*
+ * Copyright (c) 2009 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+import com.google.java.contract.Ensures;
+import com.google.java.contract.Invariant;
+import com.google.java.contract.Requires;
+import htsjdk.samtools.SAMUtils;
+import org.apache.commons.math.optimization.fitting.GaussianFunction;
+import org.broadinstitute.gatk.utils.MathUtils;
+import org.broadinstitute.gatk.utils.QualityUtils;
+
+
+/**
+ * An individual piece of recalibration data. Each bin counts up the number of observations and the number
+ * of reference mismatches seen for that combination of covariates.
+ *
+ * Created by IntelliJ IDEA.
+ * User: rpoplin
+ * Date: Nov 3, 2009
+ */
+@Invariant({
+ "estimatedQReported >= 0.0",
+ "! Double.isNaN(estimatedQReported)",
+ "! Double.isInfinite(estimatedQReported)",
+ "empiricalQuality >= 0.0 || empiricalQuality == UNINITIALIZED",
+ "! Double.isNaN(empiricalQuality)",
+ "! Double.isInfinite(empiricalQuality)",
+ "numObservations >= 0",
+ "numMismatches >= 0",
+ "numMismatches <= numObservations"
+})
+public class RecalDatum {
+ public final static byte MAX_RECALIBRATED_Q_SCORE = SAMUtils.MAX_PHRED_SCORE;
+ private static final double UNINITIALIZED = -1.0;
+
+ /**
+ * estimated reported quality score based on combined data's individual q-reporteds and number of observations
+ */
+ private double estimatedQReported;
+
+ /**
+ * the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
+ */
+ private double empiricalQuality;
+
+ /**
+ * number of bases seen in total
+ */
+ private long numObservations;
+
+ /**
+ * number of bases seen that didn't match the reference
+ */
+ private double numMismatches;
+
+ /**
+ * used when calculating empirical qualities to avoid division by zero
+ */
+ private static final int SMOOTHING_CONSTANT = 1;
+
+ //---------------------------------------------------------------------------------------------------------------
+ //
+ // constructors
+ //
+ //---------------------------------------------------------------------------------------------------------------
+
+ /**
+ * Create a new RecalDatum with given observation and mismatch counts, and an reported quality
+ *
+ * @param _numObservations observations
+ * @param _numMismatches mismatches
+ * @param reportedQuality Qreported
+ */
+ public RecalDatum(final long _numObservations, final double _numMismatches, final byte reportedQuality) {
+ if ( _numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0");
+ if ( _numMismatches < 0.0 ) throw new IllegalArgumentException("numMismatches < 0");
+ if ( reportedQuality < 0 ) throw new IllegalArgumentException("reportedQuality < 0");
+
+ numObservations = _numObservations;
+ numMismatches = _numMismatches;
+ estimatedQReported = reportedQuality;
+ empiricalQuality = UNINITIALIZED;
+ }
+
+ /**
+ * Copy copy into this recal datum, overwriting all of this objects data
+ * @param copy RecalDatum to copy
+ */
+ public RecalDatum(final RecalDatum copy) {
+ this.numObservations = copy.getNumObservations();
+ this.numMismatches = copy.getNumMismatches();
+ this.estimatedQReported = copy.estimatedQReported;
+ this.empiricalQuality = copy.empiricalQuality;
+ }
+
+ /**
+ * Add in all of the data from other into this object, updating the reported quality from the expected
+ * error rate implied by the two reported qualities
+ *
+ * @param other RecalDatum to combine
+ */
+ public synchronized void combine(final RecalDatum other) {
+ final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors();
+ increment(other.getNumObservations(), other.getNumMismatches());
+ estimatedQReported = -10 * Math.log10(sumErrors / getNumObservations());
+ empiricalQuality = UNINITIALIZED;
+ }
+
+ public synchronized void setEstimatedQReported(final double estimatedQReported) {
+ if ( estimatedQReported < 0 ) throw new IllegalArgumentException("estimatedQReported < 0");
+ if ( Double.isInfinite(estimatedQReported) ) throw new IllegalArgumentException("estimatedQReported is infinite");
+ if ( Double.isNaN(estimatedQReported) ) throw new IllegalArgumentException("estimatedQReported is NaN");
+
+ this.estimatedQReported = estimatedQReported;
+ empiricalQuality = UNINITIALIZED;
+ }
+
+ public final double getEstimatedQReported() {
+ return estimatedQReported;
+ }
+ public final byte getEstimatedQReportedAsByte() {
+ return (byte)(int)(Math.round(getEstimatedQReported()));
+ }
+
+ //---------------------------------------------------------------------------------------------------------------
+ //
+ // Empirical quality score -- derived from the num mismatches and observations
+ //
+ //---------------------------------------------------------------------------------------------------------------
+
+ /**
+ * Returns the error rate (in real space) of this interval, or 0 if there are no observations
+ * @return the empirical error rate ~= N errors / N obs
+ */
+ @Ensures({"result >= 0.0"})
+ public double getEmpiricalErrorRate() {
+ if ( numObservations == 0 )
+ return 0.0;
+ else {
+ // cache the value so we don't call log over and over again
+ final double doubleMismatches = numMismatches + SMOOTHING_CONSTANT;
+ // smoothing is one error and one non-error observation, for example
+ final double doubleObservations = numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT;
+ return doubleMismatches / doubleObservations;
+ }
+ }
+
+ public synchronized void setEmpiricalQuality(final double empiricalQuality) {
+ if ( empiricalQuality < 0 ) throw new IllegalArgumentException("empiricalQuality < 0");
+ if ( Double.isInfinite(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is infinite");
+ if ( Double.isNaN(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is NaN");
+
+ this.empiricalQuality = empiricalQuality;
+ }
+
+ public final double getEmpiricalQuality() {
+ return getEmpiricalQuality(getEstimatedQReported());
+ }
+
+ public synchronized final double getEmpiricalQuality(final double conditionalPrior) {
+ if (empiricalQuality == UNINITIALIZED) {
+ calcEmpiricalQuality(conditionalPrior);
+ }
+ return empiricalQuality;
+ }
+
+ public final byte getEmpiricalQualityAsByte() {
+ return (byte)(Math.round(getEmpiricalQuality()));
+ }
+
+ //---------------------------------------------------------------------------------------------------------------
+ //
+ // toString methods
+ //
+ //---------------------------------------------------------------------------------------------------------------
+
+ @Override
+ public String toString() {
+ return String.format("%d,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality());
+ }
+
+ public String stringForCSV() {
+ return String.format("%s,%.2f,%.2f", toString(), getEstimatedQReported(), getEmpiricalQuality() - getEstimatedQReported());
+ }
+
+ //---------------------------------------------------------------------------------------------------------------
+ //
+ // increment methods
+ //
+ //---------------------------------------------------------------------------------------------------------------
+
+ public final long getNumObservations() {
+ return numObservations;
+ }
+
+ public final synchronized void setNumObservations(final long numObservations) {
+ if ( numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0");
+ this.numObservations = numObservations;
+ empiricalQuality = UNINITIALIZED;
+ }
+
+ public final double getNumMismatches() {
+ return numMismatches;
+ }
+
+ @Requires({"numMismatches >= 0"})
+ public final synchronized void setNumMismatches(final double numMismatches) {
+ if ( numMismatches < 0 ) throw new IllegalArgumentException("numMismatches < 0");
+ this.numMismatches = numMismatches;
+ empiricalQuality = UNINITIALIZED;
+ }
+
+ @Requires({"by >= 0"})
+ public final synchronized void incrementNumObservations(final long by) {
+ numObservations += by;
+ empiricalQuality = UNINITIALIZED;
+ }
+
+ @Requires({"by >= 0"})
+ public final synchronized void incrementNumMismatches(final double by) {
+ numMismatches += by;
+ empiricalQuality = UNINITIALIZED;
+ }
+
+ @Requires({"incObservations >= 0", "incMismatches >= 0"})
+ @Ensures({"numObservations == old(numObservations) + incObservations", "numMismatches == old(numMismatches) + incMismatches"})
+ public final synchronized void increment(final long incObservations, final double incMismatches) {
+ numObservations += incObservations;
+ numMismatches += incMismatches;
+ empiricalQuality = UNINITIALIZED;
+ }
+
+ @Ensures({"numObservations == old(numObservations) + 1", "numMismatches >= old(numMismatches)"})
+ public final synchronized void increment(final boolean isError) {
+ increment(1, isError ? 1.0 : 0.0);
+ }
+
+ // -------------------------------------------------------------------------------------
+ //
+ // Private implementation helper functions
+ //
+ // -------------------------------------------------------------------------------------
+
+ /**
+ * calculate the expected number of errors given the estimated Q reported and the number of observations
+ * in this datum.
+ *
+ * @return a positive (potentially fractional) estimate of the number of errors
+ */
+ @Ensures("result >= 0.0")
+ private double calcExpectedErrors() {
+ return getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported);
+ }
+
+ /**
+ * Calculate and cache the empirical quality score from mismatches and observations (expensive operation)
+ */
+ @Requires("empiricalQuality == UNINITIALIZED")
+ @Ensures("empiricalQuality != UNINITIALIZED")
+ private synchronized void calcEmpiricalQuality(final double conditionalPrior) {
+
+ // smoothing is one error and one non-error observation
+ final long mismatches = (long)(getNumMismatches() + 0.5) + SMOOTHING_CONSTANT;
+ final long observations = getNumObservations() + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT;
+
+ final double empiricalQual = RecalDatum.bayesianEstimateOfEmpiricalQuality(observations, mismatches, conditionalPrior);
+
+ // This is the old and busted point estimate approach:
+ //final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate());
+
+ empiricalQuality = Math.min(empiricalQual, (double) MAX_RECALIBRATED_Q_SCORE);
+ }
+
+ //static final boolean DEBUG = false;
+ static private final double RESOLUTION_BINS_PER_QUAL = 1.0;
+
+ static public double bayesianEstimateOfEmpiricalQuality(final long nObservations, final long nErrors, final double QReported) {
+
+ final int numBins = (QualityUtils.MAX_REASONABLE_Q_SCORE + 1) * (int)RESOLUTION_BINS_PER_QUAL;
+
+ final double[] log10Posteriors = new double[numBins];
+
+ for ( int bin = 0; bin < numBins; bin++ ) {
+
+ final double QEmpOfBin = bin / RESOLUTION_BINS_PER_QUAL;
+
+ log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10QempLikelihood(QEmpOfBin, nObservations, nErrors);
+
+ //if ( DEBUG )
+ // System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin]));
+ }
+
+ //if ( DEBUG )
+ // System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors));
+
+ final double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10Posteriors);
+ final int MLEbin = MathUtils.maxElementIndex(normalizedPosteriors);
+
+ final double Qemp = MLEbin / RESOLUTION_BINS_PER_QUAL;
+ return Qemp;
+ }
+
+ /**
+ * Quals above this value should be capped down to this value (because they are too high)
+ * in the base quality score recalibrator
+ */
+ public final static byte MAX_GATK_USABLE_Q_SCORE = 40;
+ static private final double[] log10QempPriorCache = new double[MAX_GATK_USABLE_Q_SCORE + 1];
+ static {
+ // f(x) = a + b*exp(-((x - c)^2 / (2*d^2)))
+ // Note that b is the height of the curve's peak, c is the position of the center of the peak, and d controls the width of the "bell".
+ final double GF_a = 0.0;
+ final double GF_b = 0.9;
+ final double GF_c = 0.0;
+ final double GF_d = 0.5; // with these parameters, deltas can shift at most ~20 Q points
+
+ final GaussianFunction gaussian = new GaussianFunction(GF_a, GF_b, GF_c, GF_d);
+ for ( int i = 0; i <= MAX_GATK_USABLE_Q_SCORE; i++ ) {
+ double log10Prior = Math.log10(gaussian.value((double) i));
+ if ( Double.isInfinite(log10Prior) )
+ log10Prior = -Double.MAX_VALUE;
+ log10QempPriorCache[i] = log10Prior;
+ }
+ }
+
+ static protected double log10QempPrior(final double Qempirical, final double Qreported) {
+ final int difference = Math.min(Math.abs((int) (Qempirical - Qreported)), MAX_GATK_USABLE_Q_SCORE);
+ //if ( DEBUG )
+ // System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference]));
+ return log10QempPriorCache[difference];
+ }
+
+ static private final long MAX_NUMBER_OF_OBSERVATIONS = Integer.MAX_VALUE - 1;
+
+ static protected double log10QempLikelihood(final double Qempirical, long nObservations, long nErrors) {
+ if ( nObservations == 0 )
+ return 0.0;
+
+ // the binomial code requires ints as input (because it does caching). This should theoretically be fine because
+ // there is plenty of precision in 2^31 observations, but we need to make sure that we don't have overflow
+ // before casting down to an int.
+ if ( nObservations > MAX_NUMBER_OF_OBSERVATIONS ) {
+ // we need to decrease nErrors by the same fraction that we are decreasing nObservations
+ final double fraction = (double)MAX_NUMBER_OF_OBSERVATIONS / (double)nObservations;
+ nErrors = Math.round((double)nErrors * fraction);
+ nObservations = MAX_NUMBER_OF_OBSERVATIONS;
+ }
+
+ // this is just a straight binomial PDF
+ double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10(Qempirical));
+ if ( Double.isInfinite(log10Prob) || Double.isNaN(log10Prob) )
+ log10Prob = -Double.MAX_VALUE;
+
+ //if ( DEBUG )
+ // System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob));
+
+ return log10Prob;
+ }
+}
\ No newline at end of file
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatumNode.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatumNode.java
new file mode 100644
index 000000000..14b4c762b
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatumNode.java
@@ -0,0 +1,582 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2014 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.engine.recalibration;
+
+import com.google.java.contract.Ensures;
+import com.google.java.contract.Requires;
+import org.apache.commons.math.MathException;
+import org.apache.commons.math.stat.inference.ChiSquareTestImpl;
+import org.apache.log4j.Logger;
+import org.broadinstitute.gatk.utils.collections.Pair;
+import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
+
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.Set;
+
+/**
+ * A tree of recal datum, where each contains a set of sub datum representing sub-states of the higher level one
+ *
+ * @author Mark DePristo
+ * @since 07/27/12
+ */
+public class RecalDatumNode {
+ private final static double SMALLEST_CHI2_PVALUE = 1e-300;
+ protected static final Logger logger = Logger.getLogger(RecalDatumNode.class);
+
+ /**
+ * fixedPenalty is this value if it's considered fixed
+ */
+ private final static double UNINITIALIZED = Double.NEGATIVE_INFINITY;
+
+ private final T recalDatum;
+ private double fixedPenalty = UNINITIALIZED;
+ private final Set> subnodes;
+
+ @Requires({"recalDatum != null"})
+ public RecalDatumNode(final T recalDatum) {
+ this(recalDatum, new HashSet>());
+ }
+
+ @Override
+ public String toString() {
+ return recalDatum.toString();
+ }
+
+ @Requires({"recalDatum != null", "subnodes != null"})
+ public RecalDatumNode(final T recalDatum, final Set> subnodes) {
+ this(recalDatum, UNINITIALIZED, subnodes);
+ }
+
+ @Requires({"recalDatum != null"})
+ protected RecalDatumNode(final T recalDatum, final double fixedPenalty) {
+ this(recalDatum, fixedPenalty, new HashSet>());
+ }
+
+ @Requires({"recalDatum != null", "subnodes != null"})
+ protected RecalDatumNode(final T recalDatum, final double fixedPenalty, final Set> subnodes) {
+ this.recalDatum = recalDatum;
+ this.fixedPenalty = fixedPenalty;
+ this.subnodes = new HashSet>(subnodes);
+ }
+
+ /**
+ * Get the recal data associated with this node
+ * @return
+ */
+ @Ensures("result != null")
+ public T getRecalDatum() {
+ return recalDatum;
+ }
+
+ /**
+ * The set of all subnodes of this tree. May be modified.
+ * @return
+ */
+ @Ensures("result != null")
+ public Set> getSubnodes() {
+ return subnodes;
+ }
+
+ /**
+ * Return the fixed penalty, if set, or else the the calculated penalty for this node
+ * @return
+ */
+ public double getPenalty() {
+ if ( fixedPenalty != UNINITIALIZED )
+ return fixedPenalty;
+ else
+ return calcPenalty();
+ }
+
+ /**
+ * Set the fixed penalty for this node to a fresh calculation from calcPenalty
+ *
+ * This is important in the case where you want to compute the penalty from a full
+ * tree and then chop the tree up afterwards while considering the previous penalties.
+ * If you don't call this function then manipulating the tree may result in the
+ * penalty functions changing with changes in the tree.
+ *
+ * @param doEntireTree recurse into all subnodes?
+ * @return the fixed penalty for this node
+ */
+ public double calcAndSetFixedPenalty(final boolean doEntireTree) {
+ fixedPenalty = calcPenalty();
+ if ( doEntireTree )
+ for ( final RecalDatumNode sub : subnodes )
+ sub.calcAndSetFixedPenalty(doEntireTree);
+ return fixedPenalty;
+ }
+
+ /**
+ * Add node to the set of subnodes of this node
+ * @param sub
+ */
+ @Requires("sub != null")
+ public void addSubnode(final RecalDatumNode sub) {
+ subnodes.add(sub);
+ }
+
+ /**
+ * Is this a leaf node (i.e., has no subnodes)?
+ * @return
+ */
+ public boolean isLeaf() {
+ return subnodes.isEmpty();
+ }
+
+ /**
+ * Is this node immediately above only leaf nodes?
+ *
+ * @return
+ */
+ public boolean isAboveOnlyLeaves() {
+ for ( final RecalDatumNode sub : subnodes )
+ if ( ! sub.isLeaf() )
+ return false;
+ return true;
+ }
+
+ /**
+ * What's the immediate number of subnodes from this node?
+ * @return
+ */
+ @Ensures("result >= 0")
+ public int getNumSubnodes() {
+ return subnodes.size();
+ }
+
+ /**
+ * Total penalty is the sum of leaf node penalties
+ *
+ * This algorithm assumes that penalties have been fixed before pruning, as leaf nodes by
+ * definition have 0 penalty unless they represent a pruned tree with underlying -- but now
+ * pruned -- subtrees
+ *
+ * @return
+ */
+ public double totalPenalty() {
+ if ( isLeaf() )
+ return getPenalty();
+ else {
+ double sum = 0.0;
+ for ( final RecalDatumNode sub : subnodes )
+ sum += sub.totalPenalty();
+ return sum;
+ }
+ }
+
+ /**
+ * The maximum penalty among all nodes
+ * @return
+ */
+ public double maxPenalty(final boolean leafOnly) {
+ double max = ! leafOnly || isLeaf() ? getPenalty() : Double.MIN_VALUE;
+ for ( final RecalDatumNode sub : subnodes )
+ max = Math.max(max, sub.maxPenalty(leafOnly));
+ return max;
+ }
+
+ /**
+ * The minimum penalty among all nodes
+ * @return
+ */
+ public double minPenalty(final boolean leafOnly) {
+ double min = ! leafOnly || isLeaf() ? getPenalty() : Double.MAX_VALUE;
+ for ( final RecalDatumNode sub : subnodes )
+ min = Math.min(min, sub.minPenalty(leafOnly));
+ return min;
+ }
+
+ /**
+ * What's the longest branch from this node to any leaf?
+ * @return
+ */
+ public int maxDepth() {
+ int subMax = 0;
+ for ( final RecalDatumNode sub : subnodes )
+ subMax = Math.max(subMax, sub.maxDepth());
+ return subMax + 1;
+ }
+
+ /**
+ * What's the shortest branch from this node to any leaf? Includes this node
+ * @return
+ */
+ @Ensures("result > 0")
+ public int minDepth() {
+ if ( isLeaf() )
+ return 1;
+ else {
+ int subMin = Integer.MAX_VALUE;
+ for ( final RecalDatumNode sub : subnodes )
+ subMin = Math.min(subMin, sub.minDepth());
+ return subMin + 1;
+ }
+ }
+
+ /**
+ * Return the number of nodes, including this one, reachable from this node
+ * @return
+ */
+ @Ensures("result > 0")
+ public int size() {
+ int size = 1;
+ for ( final RecalDatumNode sub : subnodes )
+ size += sub.size();
+ return size;
+ }
+
+ /**
+ * Count the number of leaf nodes reachable from this node
+ *
+ * @return
+ */
+ @Ensures("result >= 0")
+ public int numLeaves() {
+ if ( isLeaf() )
+ return 1;
+ else {
+ int size = 0;
+ for ( final RecalDatumNode sub : subnodes )
+ size += sub.numLeaves();
+ return size;
+ }
+ }
+
+ /**
+ * Calculate the phred-scaled p-value for a chi^2 test for independent among subnodes of this node.
+ *
+ * The chi^2 value indicates the degree of independence of the implied error rates among the
+ * immediate subnodes
+ *
+ * @return the phred-scaled p-value for chi2 penalty, or 0.0 if it cannot be calculated
+ */
+ private double calcPenalty() {
+ if ( isLeaf() || freeToMerge() )
+ return 0.0;
+ else if ( subnodes.size() == 1 )
+ // only one value, so its free to merge away
+ return 0.0;
+ else {
+ final long[][] counts = new long[subnodes.size()][2];
+
+ int i = 0;
+ for ( final RecalDatumNode subnode : subnodes ) {
+ // use the yates correction to help avoid all zeros => NaN
+ counts[i][0] = Math.round(subnode.getRecalDatum().getNumMismatches()) + 1L;
+ counts[i][1] = subnode.getRecalDatum().getNumObservations() + 2L;
+ i++;
+ }
+
+ try {
+ final double chi2PValue = new ChiSquareTestImpl().chiSquareTest(counts);
+ final double penalty = -10.0 * Math.log10(Math.max(chi2PValue, SMALLEST_CHI2_PVALUE));
+
+ // make sure things are reasonable and fail early if not
+ if (Double.isInfinite(penalty) || Double.isNaN(penalty))
+ throw new ReviewedGATKException("chi2 value is " + chi2PValue + " at " + getRecalDatum());
+
+ return penalty;
+ } catch ( MathException e ) {
+ throw new ReviewedGATKException("Failed in calculating chi2 value", e);
+ }
+ }
+ }
+
+ /**
+ * Is this node free to merge because its rounded Q score is the same as all nodes below
+ * @return
+ */
+ private boolean freeToMerge() {
+ if ( isLeaf() ) // leaves are free to merge
+ return true;
+ else {
+ final byte myQual = getRecalDatum().getEmpiricalQualityAsByte();
+ for ( final RecalDatumNode sub : subnodes )
+ if ( sub.getRecalDatum().getEmpiricalQualityAsByte() != myQual )
+ return false;
+ return true;
+ }
+ }
+
+ /**
+ * Calculate the penalty of this interval, given the overall error rate for the interval
+ *
+ * If the globalErrorRate is e, this value is:
+ *
+ * sum_i |log10(e_i) - log10(e)| * nObservations_i
+ *
+ * each the index i applies to all leaves of the tree accessible from this interval
+ * (found recursively from subnodes as necessary)
+ *
+ * @param globalErrorRate overall error rate in real space against which we calculate the penalty
+ * @return the cost of approximating the bins in this interval with the globalErrorRate
+ */
+ @Requires("globalErrorRate >= 0.0")
+ @Ensures("result >= 0.0")
+ private double calcPenaltyLog10(final double globalErrorRate) {
+ if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty
+ return 0.0;
+
+ if ( isLeaf() ) {
+ // this is leave node
+ return (Math.abs(Math.log10(recalDatum.getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * (double)recalDatum.getNumObservations();
+ // TODO -- how we can generalize this calculation?
+// if ( this.qEnd <= minInterestingQual )
+// // It's free to merge up quality scores below the smallest interesting one
+// return 0;
+// else {
+// return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations();
+// }
+ } else {
+ double sum = 0;
+ for ( final RecalDatumNode hrd : subnodes)
+ sum += hrd.calcPenaltyLog10(globalErrorRate);
+ return sum;
+ }
+ }
+
+ /**
+ * Return a freshly allocated tree prunes to have no more than maxDepth from the root to any leaf
+ *
+ * @param maxDepth
+ * @return
+ */
+ public RecalDatumNode pruneToDepth(final int maxDepth) {
+ if ( maxDepth < 1 )
+ throw new IllegalArgumentException("maxDepth < 1");
+ else {
+ final Set> subPruned = new HashSet>(getNumSubnodes());
+ if ( maxDepth > 1 )
+ for ( final RecalDatumNode sub : subnodes )
+ subPruned.add(sub.pruneToDepth(maxDepth - 1));
+ return new RecalDatumNode(getRecalDatum(), fixedPenalty, subPruned);
+ }
+ }
+
+ /**
+ * Return a freshly allocated tree with to no more than maxElements in order of penalty
+ *
+ * Note that nodes must have fixed penalties to this algorithm will fail.
+ *
+ * @param maxElements
+ * @return
+ */
+ public RecalDatumNode pruneByPenalty(final int maxElements) {
+ RecalDatumNode root = this;
+
+ while ( root.size() > maxElements ) {
+ // remove the lowest penalty element, and continue
+ root = root.removeLowestPenaltyNode();
+ }
+
+ // our size is below the target, so we are good, return
+ return root;
+ }
+
+ /**
+ * Return a freshly allocated tree where all mergable nodes with < maxPenalty are merged
+ *
+ * Note that nodes must have fixed penalties to this algorithm will fail.
+ *
+ * @param maxPenaltyIn the maximum penalty we are allowed to incur for a merge
+ * @param applyBonferroniCorrection if true, we will adjust penalty by the phred-scaled bonferroni correction
+ * for the size of the initial tree. That is, if there are 10 nodes in the
+ * tree and maxPenalty is 20 we will actually enforce 10^-2 / 10 = 10^-3 = 30
+ * penalty for multiple testing
+ * @return
+ */
+ public RecalDatumNode pruneToNoMoreThanPenalty(final double maxPenaltyIn, final boolean applyBonferroniCorrection) {
+ RecalDatumNode root = this;
+
+ final double bonferroniCorrection = 10 * Math.log10(this.size());
+ final double maxPenalty = applyBonferroniCorrection ? maxPenaltyIn + bonferroniCorrection : maxPenaltyIn;
+
+ if ( applyBonferroniCorrection )
+ logger.info(String.format("Applying Bonferroni correction for %d nodes = %.2f to initial penalty %.2f for total " +
+ "corrected max penalty of %.2f", this.size(), bonferroniCorrection, maxPenaltyIn, maxPenalty));
+
+ while ( true ) {
+ final Pair, Double> minPenaltyNode = root.getMinPenaltyAboveLeafNode();
+
+ if ( minPenaltyNode == null || minPenaltyNode.getSecond() > maxPenalty ) {
+ // nothing to merge, or the best candidate is above our max allowed
+ if ( minPenaltyNode == null ) {
+ if ( logger.isDebugEnabled() ) logger.debug("Stopping because no candidates could be found");
+ } else {
+ if ( logger.isDebugEnabled() ) logger.debug("Stopping because node " + minPenaltyNode.getFirst() + " has penalty " + minPenaltyNode.getSecond() + " > max " + maxPenalty);
+ }
+ break;
+ } else {
+ // remove the lowest penalty element, and continue
+ if ( logger.isDebugEnabled() ) logger.debug("Removing node " + minPenaltyNode.getFirst() + " with penalty " + minPenaltyNode.getSecond());
+ root = root.removeLowestPenaltyNode();
+ }
+ }
+
+ // no more candidates exist with penalty < maxPenalty
+ return root;
+ }
+
+
+ /**
+ * Find the lowest penalty above leaf node in the tree, and return a tree without it
+ *
+ * Note this excludes the current (root) node
+ *
+ * @return
+ */
+ private RecalDatumNode removeLowestPenaltyNode() {
+ final Pair, Double> nodeToRemove = getMinPenaltyAboveLeafNode();
+ if ( logger.isDebugEnabled() )
+ logger.debug("Removing " + nodeToRemove.getFirst() + " with penalty " + nodeToRemove.getSecond());
+
+ final Pair, Boolean> result = removeNode(nodeToRemove.getFirst());
+
+ if ( ! result.getSecond() )
+ throw new IllegalStateException("Never removed any node!");
+
+ final RecalDatumNode oneRemoved = result.getFirst();
+ if ( oneRemoved == null )
+ throw new IllegalStateException("Removed our root node, wow, didn't expect that");
+ return oneRemoved;
+ }
+
+ /**
+ * Finds in the tree the node with the lowest penalty whose subnodes are all leaves
+ *
+ * @return the node and its penalty, or null if no such node exists
+ */
+ private Pair, Double> getMinPenaltyAboveLeafNode() {
+ if ( isLeaf() )
+ // not allowed to remove leafs directly
+ return null;
+ if ( isAboveOnlyLeaves() )
+ // we only consider removing nodes above all leaves
+ return new Pair, Double>(this, getPenalty());
+ else {
+ // just recurse, taking the result with the min penalty of all subnodes
+ Pair, Double> minNode = null;
+ for ( final RecalDatumNode sub : subnodes ) {
+ final Pair, Double> subFind = sub.getMinPenaltyAboveLeafNode();
+ if ( subFind != null && (minNode == null || subFind.getSecond() < minNode.getSecond()) ) {
+ minNode = subFind;
+ }
+ }
+ return minNode;
+ }
+ }
+
+ /**
+ * Return a freshly allocated tree without the node nodeToRemove
+ *
+ * @param nodeToRemove
+ * @return
+ */
+ private Pair, Boolean> removeNode(final RecalDatumNode nodeToRemove) {
+ if ( this == nodeToRemove ) {
+ if ( isLeaf() )
+ throw new IllegalStateException("Trying to remove a leaf node from the tree! " + this + " " + nodeToRemove);
+ // node is the thing we are going to remove, but without any subnodes
+ final RecalDatumNode node = new RecalDatumNode(getRecalDatum(), fixedPenalty);
+ return new Pair, Boolean>(node, true);
+ } else {
+ // did we remove something in a sub branch?
+ boolean removedSomething = false;
+
+ // our sub nodes with the penalty node removed
+ final Set> sub = new HashSet>(getNumSubnodes());
+
+ for ( final RecalDatumNode sub1 : subnodes ) {
+ if ( removedSomething ) {
+ // already removed something, just add sub1 back to sub
+ sub.add(sub1);
+ } else {
+ // haven't removed anything yet, so try
+ final Pair, Boolean> maybeRemoved = sub1.removeNode(nodeToRemove);
+ removedSomething = maybeRemoved.getSecond();
+ sub.add(maybeRemoved.getFirst());
+ }
+ }
+
+ final RecalDatumNode node = new RecalDatumNode(getRecalDatum(), fixedPenalty, sub);
+ return new Pair, Boolean>(node, removedSomething);
+ }
+ }
+
+ /**
+ * Return a collection of all of the data in the leaf nodes of this tree
+ *
+ * @return
+ */
+ public Collection getAllLeaves() {
+ final LinkedList list = new LinkedList();
+ getAllLeavesRec(list);
+ return list;
+ }
+
+ /**
+ * Helpful recursive function for getAllLeaves()
+ *
+ * @param list the destination for the list of leaves
+ */
+ private void getAllLeavesRec(final LinkedList list) {
+ if ( isLeaf() )
+ list.add(getRecalDatum());
+ else {
+ for ( final RecalDatumNode sub : subnodes )
+ sub.getAllLeavesRec(list);
+ }
+ }
+}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalUtils.java
new file mode 100644
index 000000000..f2f33ee59
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalUtils.java
@@ -0,0 +1,1097 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2014 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.engine.recalibration;
+
+import com.google.java.contract.Ensures;
+import com.google.java.contract.Requires;
+import org.apache.log4j.Logger;
+import org.broadinstitute.gatk.engine.recalibration.covariates.*;
+import org.broadinstitute.gatk.utils.report.GATKReport;
+import org.broadinstitute.gatk.utils.report.GATKReportTable;
+import org.broadinstitute.gatk.utils.classloader.JVMUtils;
+import org.broadinstitute.gatk.utils.recalibration.EventType;
+import org.broadinstitute.gatk.utils.BaseUtils;
+import org.broadinstitute.gatk.utils.R.RScriptExecutor;
+import org.broadinstitute.gatk.utils.Utils;
+import org.broadinstitute.gatk.utils.classloader.PluginManager;
+import org.broadinstitute.gatk.utils.collections.NestedIntegerArray;
+import org.broadinstitute.gatk.utils.collections.Pair;
+import org.broadinstitute.gatk.utils.exceptions.DynamicClassResolutionException;
+import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
+import org.broadinstitute.gatk.utils.exceptions.UserException;
+import org.broadinstitute.gatk.utils.io.Resource;
+import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord;
+import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
+import org.broadinstitute.gatk.utils.sam.ReadUtils;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: rpoplin
+ * Date: Nov 6, 2009
+ *
+ * This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions.
+ * It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias.
+ * This class holds the parsing methods that are shared between BaseRecalibrator and PrintReads.
+ */
+
+public class RecalUtils {
+ public final static String ARGUMENT_REPORT_TABLE_TITLE = "Arguments";
+ public final static String QUANTIZED_REPORT_TABLE_TITLE = "Quantized";
+ public final static String READGROUP_REPORT_TABLE_TITLE = "RecalTable0";
+ public final static String QUALITY_SCORE_REPORT_TABLE_TITLE = "RecalTable1";
+ public final static String ALL_COVARIATES_REPORT_TABLE_TITLE = "RecalTable2";
+
+ public final static String ARGUMENT_COLUMN_NAME = "Argument";
+ public final static String ARGUMENT_VALUE_COLUMN_NAME = "Value";
+ public final static String QUANTIZED_VALUE_COLUMN_NAME = "QuantizedScore";
+ public static final String QUANTIZED_COUNT_COLUMN_NAME = "Count";
+ public final static String READGROUP_COLUMN_NAME = "ReadGroup";
+ public final static String EVENT_TYPE_COLUMN_NAME = "EventType";
+ public final static String EMPIRICAL_QUALITY_COLUMN_NAME = "EmpiricalQuality";
+ public final static String ESTIMATED_Q_REPORTED_COLUMN_NAME = "EstimatedQReported";
+ public final static String QUALITY_SCORE_COLUMN_NAME = "QualityScore";
+ public final static String COVARIATE_VALUE_COLUMN_NAME = "CovariateValue";
+ public final static String COVARIATE_NAME_COLUMN_NAME = "CovariateName";
+ public final static String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations";
+ public final static String NUMBER_ERRORS_COLUMN_NAME = "Errors";
+
+ private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
+ private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
+ private static boolean warnUserNullPlatform = false;
+
+ private static final String SCRIPT_FILE = "BQSR.R";
+
+ private static final Pair covariateValue = new Pair(RecalUtils.COVARIATE_VALUE_COLUMN_NAME, "%s");
+ private static final Pair covariateName = new Pair(RecalUtils.COVARIATE_NAME_COLUMN_NAME, "%s");
+ private static final Pair eventType = new Pair(RecalUtils.EVENT_TYPE_COLUMN_NAME, "%s");
+ private static final Pair empiricalQuality = new Pair(RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f");
+ private static final Pair estimatedQReported = new Pair(RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f");
+ private static final Pair nObservations = new Pair(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d");
+ private static final Pair nErrors = new Pair(RecalUtils.NUMBER_ERRORS_COLUMN_NAME, "%.2f");
+
+ /**
+ * Generates two lists : required covariates and optional covariates based on the user's requests.
+ *
+ * Performs the following tasks in order:
+ * 1. Adds all requierd covariates in order
+ * 2. Check if the user asked to use the standard covariates and adds them all if that's the case
+ * 3. Adds all covariates requested by the user that were not already added by the two previous steps
+ *
+ * @param argumentCollection the argument collection object for the recalibration walker
+ * @return a pair of ordered lists : required covariates (first) and optional covariates (second)
+ */
+ public static Pair, ArrayList> initializeCovariates(RecalibrationArgumentCollection argumentCollection) {
+ final List> covariateClasses = new PluginManager(Covariate.class).getPlugins();
+ final List> requiredClasses = new PluginManager(RequiredCovariate.class).getPlugins();
+ final List> standardClasses = new PluginManager(StandardCovariate.class).getPlugins();
+
+ final ArrayList requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates
+ ArrayList optionalCovariates = new ArrayList();
+ if (!argumentCollection.DO_NOT_USE_STANDARD_COVARIATES)
+ optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user
+
+ // parse the -cov arguments that were provided, skipping over the ones already specified
+ if (argumentCollection.COVARIATES != null) {
+ for (String requestedCovariateString : argumentCollection.COVARIATES) {
+ // help the transition from BQSR v1 to BQSR v2
+ if ( requestedCovariateString.equals("DinucCovariate") )
+ throw new UserException.CommandLineException("DinucCovariate has been retired. Please use its successor covariate " +
+ "ContextCovariate instead, which includes the 2 bp (dinuc) substitution model of the retired DinucCovariate " +
+ "as well as an indel context to model the indel error rates");
+
+ boolean foundClass = false;
+ for (Class extends Covariate> covClass : covariateClasses) {
+ if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class
+ foundClass = true;
+ if (!requiredClasses.contains(covClass) &&
+ (argumentCollection.DO_NOT_USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) {
+ try {
+ final Covariate covariate = covClass.newInstance(); // now that we've found a matching class, try to instantiate it
+ optionalCovariates.add(covariate);
+ } catch (Exception e) {
+ throw new DynamicClassResolutionException(covClass, e);
+ }
+ }
+ }
+ }
+
+ if (!foundClass) {
+ throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates.");
+ }
+ }
+ }
+ return new Pair, ArrayList>(requiredCovariates, optionalCovariates);
+ }
+
+ /**
+ * Adds the required covariates to a covariate list
+ *
+ * Note: this method really only checks if the classes object has the expected number of required covariates, then add them by hand.
+ *
+ * @param classes list of classes to add to the covariate list
+ * @return the covariate list
+ */
+ private static ArrayList addRequiredCovariatesToList(List> classes) {
+ ArrayList dest = new ArrayList(classes.size());
+ if (classes.size() != 2)
+ throw new ReviewedGATKException("The number of required covariates has changed, this is a hard change in the code and needs to be inspected");
+
+ dest.add(new ReadGroupCovariate()); // enforce the order with RG first and QS next.
+ dest.add(new QualityScoreCovariate());
+ return dest;
+ }
+
+ /**
+ * Adds the standard covariates to a covariate list
+ *
+ * @param classes list of classes to add to the covariate list
+ * @return the covariate list
+ */
+ private static ArrayList addStandardCovariatesToList(List> classes) {
+ ArrayList dest = new ArrayList(classes.size());
+ for (Class> covClass : classes) {
+ try {
+ final Covariate covariate = (Covariate) covClass.newInstance();
+ dest.add(covariate);
+ } catch (Exception e) {
+ throw new DynamicClassResolutionException(covClass, e);
+ }
+ }
+ return dest;
+ }
+
+ /**
+ * Print a list of all available covariates to logger as info
+ *
+ * @param logger
+ */
+ public static void listAvailableCovariates(final Logger logger) {
+ logger.info("Available covariates:");
+ for (final Class extends Covariate> covClass : new PluginManager(Covariate.class).getPlugins()) {
+ logger.info(String.format("\t%30s\t%s", covClass.getSimpleName(), JVMUtils.classInterfaces(covClass)));
+ }
+ }
+
+ /**
+ * Component used to print out csv representation of the reports that can be use to perform analysis in
+ * external tools. E.g. generate plots using R scripts.
+ *
+ * A header is always printed into the output stream (or file) when the printer is created. Then you only need
+ * to call {@link #print(RecalibrationReport,String) print} for each report you want to include in the csv file.
+ * Once finished, you close the printer calling {@link #close() close}
+ *
+ */
+ private static class CsvPrinter {
+
+ private final PrintStream ps;
+ private final Covariate[] covariates;
+
+ /**
+ * Constructs a printer redirected to an output file.
+ * @param out the output file.
+ * @param c covariates to print out.
+ * @throws FileNotFoundException if the file could not be created anew.
+ */
+ protected CsvPrinter(final File out, final Covariate ... c)
+ throws FileNotFoundException {
+ this(new FileOutputStream(out), c);
+ }
+
+ /**
+ * Constructs a printer redirected to an output stream
+ * @param os the output.
+ * @param c covariates to print out.
+ */
+ protected CsvPrinter(final OutputStream os, final Covariate ... c) {
+ covariates = c == null ? new Covariate[0] : c.clone();
+ ps = new PrintStream(os);
+ printHeader();
+ }
+
+ /**
+ * Prints the header out.
+ *
+ * Should only be invoked at creation.
+ */
+ protected void printHeader() {
+ RecalUtils.printHeader(ps);
+ }
+
+ /**
+ * Prints out a report into the csv file.
+ *
+ *
+ * @param report the report to print out.
+ * @param mode the report associated mode. (typically ORIGINAL, RECALIBRATED
+ */
+ public void print(final RecalibrationReport report, final String mode) {
+ RecalUtils.writeCSV(ps,report.getRecalibrationTables(),mode,covariates,false);
+ }
+
+ /**
+ * Close the csv printer.
+ *
+ * No further output will be allowed or take place after calling this method.
+ */
+ public void close() {
+ ps.close();
+ }
+
+ }
+
+ /**
+ * Returns a csv output printer.
+ *
+ * @param out the output file. It will be overridden
+ * @param c list of covariates to print out.
+ *
+ * @throws FileNotFoundException if out could not be created anew.
+ *
+ * @return never null
+ */
+ protected static CsvPrinter csvPrinter(final File out, final Covariate ... c)
+ throws FileNotFoundException
+ {
+ if (c == null) {
+ throw new IllegalArgumentException("the input covariate array cannot be null");
+ }
+ return new CsvPrinter(out,c);
+ }
+
+ /**
+ * Prints out a collection of reports into a file in Csv format in a way
+ * that can be used by R scripts (such as the plot generator script).
+ *
+ * The set of covariates is take as the minimum common set from all reports.
+ *
+ * @param out the output file. It will be overridden.
+ * @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...)
+ * of each report and the corresponding value the report itself.
+ * @throws FileNotFoundException if out could not be created anew.
+ */
+ public static void generateCsv(final File out, final Map reports)
+ throws FileNotFoundException {
+ if (reports.size() == 0) {
+ writeCsv(out, reports, new Covariate[0]);
+ } else {
+ final Iterator rit = reports.values().iterator();
+ final RecalibrationReport first = rit.next();
+ final Covariate[] firstCovariates = first.getRequestedCovariates();
+ final Set covariates = new LinkedHashSet<>();
+ Utils.addAll(covariates,firstCovariates);
+ while (rit.hasNext() && covariates.size() > 0) {
+ final Covariate[] nextCovariates = rit.next().getRequestedCovariates();
+ final Set nextCovariateNames = new LinkedHashSet(nextCovariates.length);
+ for (final Covariate nc : nextCovariates) {
+ nextCovariateNames.add(nc.getClass().getSimpleName());
+ }
+ final Iterator cit = covariates.iterator();
+ while (cit.hasNext()) {
+ if (!nextCovariateNames.contains(cit.next().getClass().getSimpleName())) {
+ cit.remove();
+ }
+ }
+ }
+ writeCsv(out, reports, covariates.toArray(new Covariate[covariates.size()]));
+ }
+ }
+
+ /**
+ * Print out a collection of reports into a file in Csv format in a way
+ * that can be used by R scripts (such as the plot generator script).
+ *
+ * @param out
+ * @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...)
+ * of each report and the corresponding value the report itself.
+ * @param c the covariates to print out.
+ * @throws FileNotFoundException if out could not be created anew.
+ */
+ private static void writeCsv(final File out,
+ final Map reports, final Covariate[] c)
+ throws FileNotFoundException {
+ final CsvPrinter p = csvPrinter(out,c);
+ for (Map.Entry e : reports.entrySet()) {
+ p.print(e.getValue(),e.getKey());
+ }
+ p.close();
+ }
+
+ public enum SOLID_RECAL_MODE {
+ /**
+ * Treat reference inserted bases as reference matching bases. Very unsafe!
+ */
+ DO_NOTHING,
+ /**
+ * Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option.
+ */
+ SET_Q_ZERO,
+ /**
+ * In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV.
+ */
+ SET_Q_ZERO_BASE_N,
+ /**
+ * Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference.
+ */
+ REMOVE_REF_BIAS;
+
+ public static SOLID_RECAL_MODE recalModeFromString(String recalMode) {
+ if (recalMode.equals("DO_NOTHING"))
+ return SOLID_RECAL_MODE.DO_NOTHING;
+ if (recalMode.equals("SET_Q_ZERO"))
+ return SOLID_RECAL_MODE.SET_Q_ZERO;
+ if (recalMode.equals("SET_Q_ZERO_BASE_N"))
+ return SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N;
+ if (recalMode.equals("REMOVE_REF_BIAS"))
+ return SOLID_RECAL_MODE.REMOVE_REF_BIAS;
+
+ throw new UserException.BadArgumentValue(recalMode, "is not a valid SOLID_RECAL_MODE value");
+ }
+ }
+
+ public enum SOLID_NOCALL_STRATEGY {
+ /**
+ * When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option.
+ */
+ THROW_EXCEPTION,
+ /**
+ * Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare.
+ */
+ LEAVE_READ_UNRECALIBRATED,
+ /**
+ * Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses.
+ */
+ PURGE_READ;
+
+ public static SOLID_NOCALL_STRATEGY nocallStrategyFromString(String nocallStrategy) {
+ if (nocallStrategy.equals("THROW_EXCEPTION"))
+ return SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
+ if (nocallStrategy.equals("LEAVE_READ_UNRECALIBRATED"))
+ return SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED;
+ if (nocallStrategy.equals("PURGE_READ"))
+ return SOLID_NOCALL_STRATEGY.PURGE_READ;
+
+ throw new UserException.BadArgumentValue(nocallStrategy, "is not a valid SOLID_NOCALL_STRATEGY value");
+ }
+ }
+
+ private static List generateReportTables(final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) {
+ List result = new LinkedList();
+ int reportTableIndex = 0;
+ int rowIndex = 0;
+ final Map covariateNameMap = new HashMap(requestedCovariates.length);
+ for (final Covariate covariate : requestedCovariates)
+ covariateNameMap.put(covariate, parseCovariateName(covariate));
+
+ for (int tableIndex = 0; tableIndex < recalibrationTables.numTables(); tableIndex++) {
+
+ final ArrayList> columnNames = new ArrayList>(); // initialize the array to hold the column names
+ columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[0]), "%s")); // save the required covariate name so we can reference it in the future
+ if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) {
+ columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[1]), "%s")); // save the required covariate name so we can reference it in the future
+ if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) {
+ columnNames.add(covariateValue);
+ columnNames.add(covariateName);
+ }
+ }
+
+ columnNames.add(eventType); // the order of these column names is important here
+ columnNames.add(empiricalQuality);
+ if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal())
+ columnNames.add(estimatedQReported); // only the read group table needs the estimated Q reported
+ columnNames.add(nObservations);
+ columnNames.add(nErrors);
+
+ final GATKReportTable reportTable;
+ if (tableIndex <= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) {
+ if(sortByCols) {
+ reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), GATKReportTable.TableSortingWay.SORT_BY_COLUMN);
+ } else {
+ reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), GATKReportTable.TableSortingWay.DO_NOT_SORT);
+ }
+ for (final Pair columnName : columnNames)
+ reportTable.addColumn(columnName.getFirst(), columnName.getSecond());
+ rowIndex = 0; // reset the row index since we're starting with a new table
+ } else {
+ reportTable = result.get(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal());
+ }
+
+ final NestedIntegerArray table = recalibrationTables.getTable(tableIndex);
+ for (final NestedIntegerArray.Leaf row : table.getAllLeaves()) {
+ final RecalDatum datum = (RecalDatum)row.value;
+ final int[] keys = row.keys;
+
+ int columnIndex = 0;
+ int keyIndex = 0;
+ reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), requestedCovariates[0].formatKey(keys[keyIndex++]));
+ if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) {
+ reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), requestedCovariates[1].formatKey(keys[keyIndex++]));
+ if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) {
+ final Covariate covariate = requestedCovariates[tableIndex];
+
+ reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), covariate.formatKey(keys[keyIndex++]));
+ reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), covariateNameMap.get(covariate));
+ }
+ }
+
+ final EventType event = EventType.eventFrom(keys[keyIndex]);
+ reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), event.toString());
+
+ reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEmpiricalQuality());
+ if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal())
+ reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table
+ reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getNumObservations());
+ reportTable.set(rowIndex, columnNames.get(columnIndex).getFirst(), datum.getNumMismatches());
+
+ rowIndex++;
+ }
+ result.add(reportTable);
+ }
+
+ return result;
+ }
+
+ private static String parseCovariateName(final Covariate covariate) {
+ return covariate.getClass().getSimpleName().split("Covariate")[0];
+ }
+
+ /**
+ * Return a human-readable string representing the used covariates
+ *
+ * @param requestedCovariates a vector of covariates
+ * @return a non-null comma-separated string
+ */
+ public static String covariateNames(final Covariate[] requestedCovariates) {
+ final List names = new ArrayList(requestedCovariates.length);
+ for ( final Covariate cov : requestedCovariates )
+ names.add(cov.getClass().getSimpleName());
+ return Utils.join(",", names);
+ }
+
+ /**
+ * Outputs the GATK report to RAC.RECAL_TABLE.
+ *
+ * @param RAC The list of shared command line arguments
+ * @param quantizationInfo Quantization info
+ * @param recalibrationTables Recalibration tables
+ * @param requestedCovariates The list of requested covariates
+ * @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT
+ */
+ public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) {
+ final GATKReport report = createRecalibrationGATKReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols));
+ report.print(RAC.RECAL_TABLE);
+ }
+
+ /**
+ * Creates a consolidated GATK report, first generating report tables. Report can then be written to a stream via GATKReport.print(PrintStream).
+ *
+ * @param argumentTable Argument table
+ * @param quantizationInfo Quantization info
+ * @param recalibrationTables Recalibration tables
+ * @param requestedCovariates The list of requested covariates
+ * @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT
+ * @return GATK report
+ */
+ public static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final boolean sortByCols) {
+ return createRecalibrationGATKReport(argumentTable, quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols));
+ }
+
+ /**
+ * Creates a consolidated GATK report from the tables. Report can then be written to a stream via GATKReport.print(PrintStream).
+ *
+ * @param argumentTable Argument table
+ * @param quantizationTable Quantization Table
+ * @param recalTables Other recal tables
+ * @return GATK report
+ */
+ private static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List recalTables) {
+ final GATKReport report = new GATKReport();
+ report.addTable(argumentTable);
+ report.addTable(quantizationTable);
+ report.addTables(recalTables);
+ return report;
+ }
+
+ /** s
+ * Write recalibration plots into a file
+ *
+ * @param csvFile location of the intermediary file
+ * @param exampleReportFile where the report arguments are collected from.
+ * @param output result plot file name.
+ */
+ public static void generatePlots(final File csvFile, final File exampleReportFile, final File output) {
+ final RScriptExecutor executor = new RScriptExecutor();
+ executor.setExceptOnError(true);
+ executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class));
+ executor.addArgs(csvFile.getAbsolutePath());
+ executor.addArgs(exampleReportFile.getAbsolutePath());
+ executor.addArgs(output.getAbsolutePath());
+ Logger.getLogger(RecalUtils.class).debug("R command line: " + executor.getApproximateCommandLine());
+ executor.exec();
+ }
+
+ private static void outputRecalibrationPlot(final File csvFile, final RecalibrationArgumentCollection RAC) {
+
+ final RScriptExecutor executor = new RScriptExecutor();
+ executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class));
+ executor.addArgs(csvFile.getAbsolutePath());
+ executor.addArgs(RAC.RECAL_TABLE_FILE.getAbsolutePath());
+ executor.exec();
+ }
+
+ /**
+ * Please use {@link #generateCsv(java.io.File, java.util.Map)} and {@link #generatePlots(java.io.File, java.io.File, java.io.File)} instead.
+ *
+ * @deprecated
+ */
+ @Deprecated
+ public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final Covariate[] requestedCovariates) {
+ generateRecalibrationPlot(RAC, original, null, requestedCovariates);
+ }
+
+ /**
+ * Please use {@link #generateCsv(java.io.File, java.util.Map)} and {@link #generatePlots(java.io.File, java.io.File, java.io.File)} instead.
+ *
+ * @deprecated
+ */
+ @Deprecated
+ public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates) {
+ final PrintStream csvStream;
+ final File csvTempFile = null;
+ try {
+ File csvTmpFile = File.createTempFile("BQSR",".csv");
+ csvTmpFile.deleteOnExit();
+ csvStream = new PrintStream(csvTmpFile);
+ } catch (IOException e) {
+ throw new UserException("Could not create temporary csv file", e);
+ }
+
+ if ( recalibrated != null )
+ writeCSV(csvStream, recalibrated, "RECALIBRATED", requestedCovariates, true);
+ writeCSV(csvStream, original, "ORIGINAL", requestedCovariates, recalibrated == null);
+ csvStream.close();
+ outputRecalibrationPlot(csvTempFile, RAC);
+ csvTempFile.delete();
+ }
+
+ private static void writeCSV(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final Covariate[] requestedCovariates, final boolean printHeader) {
+
+ final NestedIntegerArray deltaTable = createDeltaTable(recalibrationTables, requestedCovariates.length);
+
+ // add the quality score table to the delta table
+ final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable();
+ for (final NestedIntegerArray.Leaf leaf : qualTable.getAllLeaves()) { // go through every element in the covariates table to create the delta table
+ final int[] newCovs = new int[4];
+ newCovs[0] = leaf.keys[0];
+ newCovs[1] = requestedCovariates.length; // replace the covariate name with an arbitrary (unused) index for QualityScore
+ newCovs[2] = leaf.keys[1];
+ newCovs[3] = leaf.keys[2];
+ addToDeltaTable(deltaTable, newCovs, (RecalDatum)leaf.value); // add this covariate to the delta table
+ }
+
+ // add the optional covariates to the delta table
+ for (int i = RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < requestedCovariates.length; i++) {
+ final NestedIntegerArray covTable = recalibrationTables.getTable(i);
+ for (final NestedIntegerArray.Leaf leaf : covTable.getAllLeaves()) {
+ final int[] covs = new int[4];
+ covs[0] = leaf.keys[0];
+ covs[1] = i; // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS)
+ covs[2] = leaf.keys[2];
+ covs[3] = leaf.keys[3];
+ addToDeltaTable(deltaTable, covs, (RecalDatum) leaf.value); // add this covariate to the delta table
+ }
+ }
+
+ // output the csv file
+ if (printHeader) {
+ printHeader(deltaTableFile);
+ }
+
+ final Map covariateNameMap = new HashMap(requestedCovariates.length);
+ for (final Covariate covariate : requestedCovariates)
+ covariateNameMap.put(covariate, parseCovariateName(covariate));
+
+ // print each data line
+ for (final NestedIntegerArray.Leaf leaf : deltaTable.getAllLeaves()) {
+ final List