diff --git a/.gitignore b/.gitignore
index 65f111587..ac3b931eb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,5 @@ dump/
lib/
out/
/atlassian-ide-plugin.xml
+maven-metadata-local.xml
+dependency-reduced-pom.xml
diff --git a/ant-bridge.sh b/ant-bridge.sh
new file mode 100755
index 000000000..9f4713d7c
--- /dev/null
+++ b/ant-bridge.sh
@@ -0,0 +1,173 @@
+#!/bin/bash
+# Maps legacy ant targets and -D properties onto the equivalent mvn command line, prints it, and runs it unless "dry" is given.
+mvn_args="verify"            # goals and flags handed to mvn; "verify" is the default when no target is given
+mvn_properties=              # translated -D properties, appended after the goals
+mvn_clean=                   # set to "clean" when a separate "mvn clean" run is requested
+unknown_args=                # quoted list of arguments that could not be translated
+property_regex='^-D([^=]*)=(.*)$'   # group 1: property name (up to the first "="), group 2: value
+unit_test_regex='.*UnitTest'
+post_script=                 # shell suffix appended to the final command; no active target sets it
+run_type="run"               # "dry" prints the command without running it
+# Translate the arguments in order; requires bash for [[ ]], =~ and BASH_REMATCH.
+for arg in "${@}" ; do
+    if [[ "${arg}" == "dry" ]] ; then
+        run_type="dry"
+    # "clean" empties mvn_args so a bare "clean" runs only mvn clean; it also discards any target given before it.
+    elif [[ "${arg}" == "clean" ]] ; then
+        mvn_clean="clean"
+        mvn_args=
+    # -Dname=value arguments are translated to their maven equivalents.
+    elif [[ "${arg}" =~ ${property_regex} ]] ; then
+        property_name=${BASH_REMATCH[1]}
+        property_value=${BASH_REMATCH[2]}
+        # single=<test>: values matching unit_test_regex go to -Dtest, all others to -Dit.test; the other one is set to "disabled".
+        if [[ "${property_name}" == "single" ]] ; then
+            test_property="test"
+            test_disabled="it.test"
+            if [[ ! "${property_value}" =~ ${unit_test_regex} ]] ; then
+                test_property="it.test"
+                test_disabled="test"
+            fi
+
+            mvn_properties="${mvn_properties} -D${test_disabled}=disabled -D${test_property}=${property_value}"
+        # test.debug.port=<port>: pass suspended JDWP debug options listening on <port> to both surefire and failsafe.
+        elif [[ "${property_name}" == "test.debug.port" ]] ; then
+            mvn_properties="${mvn_properties} -Dmaven.surefire.debug=\"-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=${property_value}\""
+            mvn_properties="${mvn_properties} -Dmaven.failsafe.debug=\"-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=${property_value}\""
+        # test.default.maxmemory is passed through under the name test.maxmemory.
+        elif [[ "${property_name}" == "test.default.maxmemory" ]] ; then
+            mvn_properties="${mvn_properties} -Dtest.maxmemory=${property_value}"
+
+        else
+            unknown_args="${unknown_args} \"${arg}\""
+
+        fi
+    # Anything that is not a -D property is treated as an ant target name.
+    else
+        if [[ "${arg}" != "dist" && "${mvn_args}" != "" && "${mvn_args}" != "verify" ]] ; then
+            echo "Sorry, this script does not currently support mixing targets." >&2
+            exit 1
+
+        elif [[ "${arg}" == "dist" ]] ; then
+            mvn_args="verify"
+
+        elif [[ "${arg}" == "gatk" ]] ; then
+            mvn_args="verify '-P!queue'"
+
+        elif [[ "${arg}" == "test.compile" ]] ; then
+            mvn_args="test-compile"
+
+        elif [[ "${arg}" == "gatkdocs" ]] ; then
+            local_repo="sitetemprepo"
+            mvn_args="install -Dmaven.repo.local=${local_repo} -Ddisable.queue && mvn site -Dmaven.repo.local=${local_repo} -Ddisable.queue"
+
+        elif [[ "${arg}" == "package.gatk.full" ]] ; then
+            mvn_args="package '-P!private,!queue'"
+
+        elif [[ "${arg}" == "package.gatk.all" ]] ; then
+            mvn_args="package '-P!queue'"
+
+        elif [[ "${arg}" == "package.queue.full" ]] ; then
+            mvn_args="package '-P!private'"
+
+        elif [[ "${arg}" == "package.queue.all" ]] ; then
+            mvn_args="package"
+
+#        elif [[ "${arg}" == "release.gatk.full" ]] ; then
+#            mvn_args="package '-P!private,!queue'"
+#            post_script=" && private/src/main/scripts/shell/copy_release.sh public/gatk-package/target/GenomeAnalysisTK-*.tar.bz2"
+
+#        elif [[ "${arg}" == "release.queue.full" ]] ; then
+#            mvn_args="package '-P!private'"
+#            post_script=" && private/src/main/scripts/shell/copy_release.sh public/queue-package/target/Queue-*.tar.bz2"
+
+        elif [[ "${arg}" == "build-picard-private" ]] ; then
+            mvn_args="install -f private/picard-maven/pom.xml"   # no leading "mvn": mvn_cmd below already supplies it
+
+        # TODO: clover support
+        # see ant and maven docs for clover:
+        # https://confluence.atlassian.com/display/CLOVER/1.+QuickStart+Guide
+        # https://confluence.atlassian.com/display/CLOVER/Clover-for-Maven+2+and+3+User%27s+Guide
+        #
+        #elif [[ "${arg}" == "clover.report" ]] ; then
+        #    mvn_args=...
+        #
+        #elif [[ "${arg}" == "with.clover" ]] ; then
+        #    mvn_args=...
+
+        # TODO: This runs *all* commit tests, including the few on Queue.
+        elif [[ "${arg}" == "gatkfull.binary.release.tests" ]] ; then
+            local_repo="sitetemprepo"
+            mvn_args="install -Dmaven.repo.local=${local_repo} && mvn verify"
+            mvn_args="${mvn_args} -Dmaven.repo.local=${local_repo}"
+            mvn_args="${mvn_args} -Dsting.packagetests.enabled=true"
+            mvn_args="${mvn_args} -Dsting.packagecommittests.skipped=false"
+
+        # TODO: This runs only the pipeline tests (full, non-dry run), but not the commit tests for Queue.
+        elif [[ "${arg}" == "queuefull.binary.release.tests" ]] ; then
+            local_repo="sitetemprepo"
+            mvn_args="install -Dmaven.repo.local=${local_repo} && mvn verify"
+            mvn_args="${mvn_args} -Dmaven.repo.local=${local_repo}"
+            mvn_args="${mvn_args} -Dsting.packagetests.enabled=true"
+            mvn_args="${mvn_args} -Dsting.packagepipelinetests.skipped=false"
+            mvn_args="${mvn_args} -Dsting.pipelinetests.run=true"
+
+        elif [[ "${arg}" == "committests" ]] ; then
+            mvn_args="verify -Dsting.committests.skipped=false"
+
+        elif [[ "${arg}" == "test" ]] ; then
+            mvn_args="test -Dsting.unittests.skipped=false"
+
+        elif [[ "${arg}" == "unittest" ]] ; then
+            mvn_args="test -Dsting.unittests.skipped=false"
+
+        elif [[ "${arg}" == "integrationtest" ]] ; then
+            mvn_args="verify -Dsting.integrationtests.skipped=false"
+
+        elif [[ "${arg}" == "largescaletest" ]] ; then
+            mvn_args="verify -Dsting.largescaletests.skipped=false"
+
+        elif [[ "${arg}" == "knowledgebasetest" ]] ; then
+            mvn_args="verify -Dsting.knowledgebasetests.skipped=false"
+
+        elif [[ "${arg}" == "pipelinetest" ]] ; then
+            mvn_args="verify -Dsting.pipelinetests.skipped=false"
+
+        elif [[ "${arg}" == "pipelinetestrun" ]] ; then
+            mvn_args="verify -Dsting.pipelinetests.skipped=false -Dsting.pipelinetests.run=true"
+
+        elif [[ "${arg}" == "fasttest" ]] ; then
+            mvn_args="verify -Dsting.committests.skipped=false -pl private/gatk-private -am -Dresource.bundle.skip=true"
+
+        else
+            unknown_args="${unknown_args} \"${arg}\""
+
+        fi
+
+    fi
+
+done
+# A requested clean runs as its own mvn invocation ahead of the build goals (or alone when no goals remain).
+mvn_cmd=
+if [[ "${mvn_clean}" != "" ]] ; then
+    if [[ "${mvn_args}" != "" ]] ; then
+        mvn_cmd="mvn ${mvn_clean} && mvn ${mvn_args}"
+    else
+        mvn_cmd="mvn ${mvn_clean}"
+    fi
+else
+    mvn_cmd="mvn ${mvn_args}"
+fi
+
+if [[ "${unknown_args}" != "" ]] ; then
+    echo "Unrecognized arguments:${unknown_args}" >&2
+    exit 1   # report failure to the caller instead of exiting 0 without having run anything
+else
+    echo "Equivalent maven command"
+    echo "${mvn_cmd}${mvn_properties}${post_script}"
+
+    if [[ "${run_type}" != "dry" ]] ; then
+        sh -c "${mvn_cmd}${mvn_properties}${post_script}"
+    fi
+
+fi
diff --git a/build.xml b/build.xml
deleted file mode 100644
index fd0801bfb..000000000
--- a/build.xml
+++ /dev/null
@@ -1,1518 +0,0 @@
-
-
-
-
- Compile and distribute the Sting toolkit
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Generating Queue GATK extensions...
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- Building Scala...
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/ivy.xml b/ivy.xml
deleted file mode 100644
index 2e45247ab..000000000
--- a/ivy.xml
+++ /dev/null
@@ -1,117 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 000000000..d899506b5
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,858 @@
+
+
+ 4.0.0
+
+
+
+
+ org.broadinstitute.sting
+ sting-root
+ 3.0
+ public/sting-root
+
+
+ sting-aggregator
+ pom
+ Sting Aggregator
+
+
+ public
+
+
+
+
+ ${project.basedir}
+ StingText.properties
+ false
+
+ -build-timestamp "${maven.build.timestamp}"
+
+
+ package
+ generate-resources
+ process-resources
+ process-test-resources
+
+
+ true
+ ${sting.packagecommittests.skipped}
+ ${sting.packagecommittests.skipped}
+ ${sting.packagecommittests.skipped}
+ true
+ true
+
+
+ true
+ ${sting.serialcommittests.skipped}
+ ${sting.serialcommittests.skipped}
+ ${sting.serialcommittests.skipped}
+ true
+ true
+
+
+
+
+ com.sun
+ tools
+
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-clean-plugin
+
+
+
+
+ gatkdocs
+
+
+ ${basedir}
+
+ javadoc.sh
+ options
+ packages
+
+
+
+
+ ${basedir}
+
+ dependency-reduced-pom.xml
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-dependency-plugin
+
+
+ unpack-direct-dependencies
+
+ unpack-dependencies
+
+ none
+
+ true
+ ${project.build.outputDirectory}
+ jar
+ system
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-resources-plugin
+
+
+ default-resources
+
+ resources
+
+ ${sting.process-resources.phase}
+
+
+ default-testResources
+
+ testResources
+
+ ${sting.process-test-resources.phase}
+
+
+ copy-resource-bundle-log4j
+
+ copy-resources
+
+ none
+
+ ${project.reporting.outputDirectory}/apidocs
+
+
+ ${sting.basedir}/sting-utils/src/main/config/org/broadinstitute/sting/utils/help
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+
+
+ extract-resource-bundle
+
+ javadoc
+
+ none
+
+
+ ${resource.bundle.skip}
+ org.broadinstitute.sting.utils.help.ResourceBundleExtractorDoclet
+
+ ${project.build.outputDirectory}
+
+ ${project.groupId}
+
+ gatk-framework
+ ${project.version}
+
+ 2g
+ false
+ true
+ -build-timestamp "${maven.build.timestamp}" -absolute-version ${build.version} -out ${project.build.outputDirectory}/${resource.bundle.path}
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+
+ none
+
+ com.google.java.contract.core.apt.AnnotationProcessor
+
+
+
+
+ default-compile
+ none
+
+
+ default-testCompile
+ none
+
+
+
+ compile-package-info
+
+ compile
+
+ compile
+
+
+ -Xpkginfo:always
+
+
+ **/package-info.java
+
+
+
+
+
+ compile-java
+
+ compile
+
+ compile
+
+
+
+ **/package-info.java
+
+
+
+
+
+ testCompile-java
+
+ testCompile
+
+ test-compile
+
+
+
+
+
+ org.scala-tools
+ maven-scala-plugin
+
+
+
+ compile
+ testCompile
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+
+
+ default-jar
+ ${sting.jar.phase}
+
+
+ test-jar
+
+ test-jar
+
+ ${sting.jar.phase}
+
+ true
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-shade-plugin
+
+
+ sting-executable
+
+ shade
+
+ none
+
+ true
+
+
+ org.broadinstitute.sting:gsalib:tar.gz:*
+ org.broadinstitute.sting:*:tar.bz2:example-resources
+
+
+
+
+
+ ${app.main.class}
+
+
+
+ ${resource.bundle.path}
+
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-assembly-plugin
+
+
+ example-resources
+
+ single
+
+ none
+
+
+ src/main/assembly/example-resources.xml
+
+
+
+
+ binary-dist
+
+ single
+
+ none
+
+
+ src/main/assembly/binary-dist.xml
+
+
+
+
+
+
+
+
+ com.pyx4j
+ maven-junction-plugin
+
+
+ link-public-testdata
+
+ link
+
+ none
+
+
+
+ ${basedir}/public/testdata
+ ${sting.basedir}/public/gatk-framework/src/test/resources
+
+
+
+
+
+ unlink-public-testdata
+
+ unlink
+
+ none
+
+
+
+ ${basedir}/public/testdata
+ ${sting.basedir}/public/gatk-framework/src/test/resources
+
+
+
+
+
+ link-private-testdata
+
+ link
+
+ none
+
+
+
+ ${basedir}/private/testdata
+ ${sting.basedir}/private/gatk-private/src/test/resources
+
+
+
+
+
+ unlink-private-testdata
+
+ unlink
+
+ none
+
+
+
+ ${basedir}/private/testdata
+ ${sting.basedir}/private/gatk-private/src/test/resources
+
+
+
+
+
+ link-public-qscript
+
+ link
+
+ none
+
+
+
+ ${basedir}/public/scala/qscript
+ ${sting.basedir}/public/queue-framework/src/main/qscripts
+
+
+
+
+
+ unlink-public-qscript
+
+ unlink
+
+ none
+
+
+
+ ${basedir}/public/scala/qscript
+ ${sting.basedir}/public/queue-framework/src/main/qscripts
+
+
+
+
+
+ link-private-qscript
+
+ link
+
+ none
+
+
+
+ ${basedir}/private/scala/qscript
+ ${sting.basedir}/private/queue-private/src/main/qscripts
+
+
+
+
+
+ unlink-private-qscript
+
+ unlink
+
+ none
+
+
+
+ ${basedir}/private/scala/qscript
+ ${sting.basedir}/private/queue-private/src/main/qscripts
+
+
+
+
+
+ link-binary-jar
+
+ link
+
+ none
+
+
+
+ ${sting.basedir}/target/${sting.binary-dist.name}.${project.packaging}
+ ${project.build.directory}/${project.build.finalName}.${project.packaging}
+
+
+
+
+
+ link-git-release
+
+ link
+
+ none
+
+
+
+ ${project.build.directory}/${sting.binary-dist.name}-${build.version}.tar.bz2
+ ${project.build.directory}/${project.build.finalName}-binary-dist.tar.bz2
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-invoker-plugin
+
+ true
+ false
+ ${sting.basedir}/public/package-tests/pom.xml
+ true
+ true
+ ${sting.basedir}/${maven.repo.local}
+
+ ${test}
+ ${it.test}
+ false
+ false
+ ${sting.packagetests.artifactId}
+ ${project.build.testOutputDirectory}
+ ${project.basedir}
+ ${sting.pipelinetests.run}
+ ${maven.surefire.debug}
+ ${maven.failsafe.debug}
+
+
+
+
+
+ package-unittests
+
+ run
+
+
+
+ test
+
+ ${project.build.directory}/invoker-reports/unit/${test}
+ ${sting.packageunittests.skipped}
+
+ true
+ ${sting.packageunittests.skipped}
+
+
+
+
+ package-integrationtests
+
+ integration-test
+ verify
+
+
+
+ verify
+
+ ${project.build.directory}/invoker-reports/integration/${it.test}
+ ${sting.packageintegrationtests.skipped}
+
+ true
+ ${sting.packageintegrationtests.skipped}
+
+
+
+
+ package-pipelinetests
+
+ integration-test
+ verify
+
+
+
+ verify
+
+ ${project.build.directory}/invoker-reports/pipeline/${it.test}
+ ${sting.packagepipelinetests.skipped}
+
+ true
+ ${sting.packagepipelinetests.skipped}
+
+
+
+
+ package-largescaletests
+
+ integration-test
+ verify
+
+
+
+ verify
+
+ ${project.build.directory}/invoker-reports/largescale/${it.test}
+ ${sting.packagelargescaletests.skipped}
+
+ true
+ ${sting.packagelargescaletests.skipped}
+
+
+
+
+ package-knowledgebasetests
+
+ integration-test
+ verify
+
+
+
+ verify
+
+ ${project.build.directory}/invoker-reports/knowledgebase/${it.test}
+ ${sting.packageknowledgebasetests.skipped}
+
+ true
+ ${sting.packageknowledgebasetests.skipped}
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-install-plugin
+ 2.5
+
+
+ install-package
+
+ install-file
+
+ none
+
+ true
+ ${project.groupId}
+ ${project.artifactId}
+ ${project.version}
+ ${project.packaging}
+ ${project.build.directory}/${project.build.finalName}.${project.packaging}
+
+
+
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-failsafe-plugin
+
+
+ com.pyx4j
+ maven-junction-plugin
+
+
+ link-public-testdata
+ process-test-resources
+
+
+ unlink-public-testdata
+ clean
+
+
+ link-public-qscript
+ process-test-resources
+
+
+ unlink-public-qscript
+ clean
+
+
+
+
+ org.apache.maven.plugins
+ maven-clean-plugin
+
+
+ com.google.code.sortpom
+ maven-sortpom-plugin
+
+
+ package-tests
+
+ sort
+
+ verify
+ false
+
+ public/package-tests/pom.xml
+
+
+
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 2.9.1
+
+
+
+
+
+
+ generate-gatk-docs
+
+ aggregate
+
+
+ false
+
+ org.broadinstitute.sting.utils.help.GATKDoclet
+
+ ${project.groupId}
+ gatk-package
+ ${project.version}
+
+ false
+ true
+ private
+ -build-timestamp "${maven.build.timestamp}" -absolute-version ${build.version} ${gatkdocs.include.hidden} -settings-dir ${sting.basedir}/settings/helpTemplates -destination-dir ${project.build.directory}/gatkdocs
+
+
+
+
+
+
+
+
+
+
+ protected
+
+
+ ${basedir}/protected/pom.xml
+
+
+
+ protected
+
+
+
+
+
+ private
+
+
+ ${basedir}/private/pom.xml
+
+
+
+ private
+
+
+
+
+
+ com.pyx4j
+ maven-junction-plugin
+
+
+ link-private-testdata
+ process-test-resources
+
+
+ unlink-private-testdata
+ clean
+
+
+ link-private-qscript
+ process-test-resources
+
+
+ unlink-private-qscript
+ clean
+
+
+
+
+
+
+
+
+
+ packagetests-enabled
+
+
+ sting.packagetests.enabled
+ true
+
+
+
+ true
+ true
+ none
+ none
+ none
+ none
+
+
+
+
+
diff --git a/protected/gatk-protected/pom.xml b/protected/gatk-protected/pom.xml
new file mode 100644
index 000000000..26aabd187
--- /dev/null
+++ b/protected/gatk-protected/pom.xml
@@ -0,0 +1,139 @@
+
+
+ 4.0.0
+
+
+ org.broadinstitute.sting
+ sting-aggregator
+ 3.0
+ ../..
+
+
+ gatk-protected
+ jar
+ GATK Protected
+
+
+ ${project.basedir}/../..
+ gatk-package
+
+
+
+
+ ${project.groupId}
+ gatk-framework
+ ${project.version}
+
+
+
+ net.sf.jgrapht
+ jgrapht
+
+
+
+ gov.nist.math
+ jama
+
+
+
+ it.unimi.dsi
+ fastutil
+
+
+
+ ${project.groupId}
+ gatk-framework
+ ${project.version}
+ test-jar
+ test
+
+
+
+ org.testng
+ testng
+ test
+
+
+ com.google.caliper
+ caliper
+ test
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-resources-plugin
+
+
+ copy-resource-bundle-log4j
+ prepare-package
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+
+
+ extract-resource-bundle
+ prepare-package
+
+
+
+
+ org.apache.maven.plugins
+ maven-invoker-plugin
+
+
+ package-unittests
+
+
+ package-integrationtests
+
+
+ package-largescaletests
+
+
+ package-knowledgebasetests
+
+
+ package-pipelinetests
+
+
+
+
+
+
+
+
+ private
+
+
+ ${basedir}/../../private/gatk-private/pom.xml
+
+
+
+
+
+
+ com.pyx4j
+ maven-junction-plugin
+
+
+ link-private-testdata
+ process-test-resources
+
+
+ unlink-private-testdata
+ clean
+
+
+
+
+
+
+
+
+
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java
new file mode 100644
index 000000000..29cee9e15
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java
@@ -0,0 +1,114 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.variant.vcf.VCFConstants;
+import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.variant.vcf.VCFStandardHeaderLines;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Total (unfiltered) depth over all samples.
+ *
+ * <p>While the sample-level (FORMAT) DP field describes the total depth of reads that passed the caller's
+ * internal quality control metrics (like MAPQ &gt; 17, for example), the INFO field DP represents the unfiltered depth
+ * over all samples. Note though that the DP is affected by downsampling (-dcov), so the max value one can obtain for
+ * N samples with -dcov D is N * D
+ * </p>
+ */
+public class Coverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
+    /** Returns a one-entry map from the depth key to the summed depth as a decimal string, or null when both inputs are null or the one consulted is empty. */
+    public Map<String, Object> annotate(final RefMetaDataTracker tracker,
+                                        final AnnotatorCompatible walker,
+                                        final ReferenceContext ref,
+                                        final Map<String, AlignmentContext> stratifiedContexts,
+                                        final VariantContext vc,
+                                        final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap ) {
+
+        int depth = 0;
+        if (stratifiedContexts != null) {   // takes precedence when both inputs are supplied: sum the pileup depth of every entry
+            if ( stratifiedContexts.size() == 0 )
+                return null;
+
+            for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() )
+                depth += sample.getValue().getBasePileup().depthOfCoverage();
+        }
+        else if (perReadAlleleLikelihoodMap != null) {   // otherwise sum the size of each entry's likelihood read map
+            if ( perReadAlleleLikelihoodMap.size() == 0 )
+                return null;
+
+            for (PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) {
+                depth += maps.getLikelihoodReadMap().size();
+            }
+        }
+        else
+            return null;
+
+        Map<String, Object> map = new HashMap<String, Object>();
+        map.put(getKeyNames().get(0), String.format("%d", depth));
+        return map;
+    }
+    /** The only key this annotation writes is VCFConstants.DEPTH_KEY. */
+    public List<String> getKeyNames() { return Arrays.asList(VCFConstants.DEPTH_KEY); }
+    /** Header line for the depth key, looked up through VCFStandardHeaderLines.getInfoLine. */
+    public List<VCFInfoHeaderLine> getDescriptions() {
+        return Arrays.asList(VCFStandardHeaderLines.getInfoLine(getKeyNames().get(0)));
+    }
+}
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java
new file mode 100644
index 000000000..52b09d251
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java
@@ -0,0 +1,164 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
+import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.variant.vcf.VCFConstants;
+import org.broadinstitute.variant.vcf.VCFFormatHeaderLine;
+import org.broadinstitute.variant.vcf.VCFStandardHeaderLines;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.variant.variantcontext.Allele;
+import org.broadinstitute.variant.variantcontext.Genotype;
+import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+
+import java.util.*;
+
+
+/**
+ * The depth of coverage of each allele per sample
+ *
+ * <p>
+ * The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this
+ * sample at this site. While the sample-level (FORMAT) DP field describes the total depth of reads that passed the
+ * caller's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of the
+ * REF and ALT fields) are the unfiltered counts of all reads that carried with them the
+ * REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the
+ * power I have to determine the genotype of the sample at this site, while the AD tells me how many times
+ * I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering
+ * the reads. If, for example, I believe there really is an A/T polymorphism at a site, then I would like
+ * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
+ * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that
+ * the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted.
+ * Because of this fact, the sum of AD may be different than the individual sample depth, especially when there are
+ * many non-informative reads.
+ *
+ * <p>
+ * Because the AD includes reads and bases that were filtered by the caller and in case of indels is based on a statistical computation,
+ * one should not make assumptions about the underlying genotype based on it;
+ * instead, the genotype likelihoods (PLs) are what determine the genotype calls.
+ *
+ */
+public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation {
+
+ public void annotate(final RefMetaDataTracker tracker,
+ final AnnotatorCompatible walker,
+ final ReferenceContext ref,
+ final AlignmentContext stratifiedContext,
+ final VariantContext vc,
+ final Genotype g,
+ final GenotypeBuilder gb,
+ final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
+ if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) )
+ return;
+
+ if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty())
+ annotateWithLikelihoods(alleleLikelihoodMap, vc, gb);
+ else if ( stratifiedContext != null && (vc.isSNP()))
+ annotateWithPileup(stratifiedContext, vc, gb);
+ }
+
+ private void annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc, final GenotypeBuilder gb) {
+
+ final HashMap alleleCounts = new HashMap<>();
+ for ( final Allele allele : vc.getAlleles() )
+ alleleCounts.put(allele.getBases()[0], 0);
+
+ final ReadBackedPileup pileup = stratifiedContext.getBasePileup();
+ for ( final PileupElement p : pileup ) {
+ if ( alleleCounts.containsKey(p.getBase()) )
+ alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1);
+ }
+
+ // we need to add counts in the correct order
+ final int[] counts = new int[alleleCounts.size()];
+ counts[0] = alleleCounts.get(vc.getReference().getBases()[0]);
+ for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
+ counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]);
+
+ gb.AD(counts);
+ }
+
+ private void annotateWithLikelihoods(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc, final GenotypeBuilder gb) {
+ final Set alleles = new HashSet<>(vc.getAlleles());
+
+ // make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext
+ if ( ! perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) )
+ throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet());
+
+ final HashMap alleleCounts = new HashMap<>();
+ for ( final Allele allele : vc.getAlleles() ) { alleleCounts.put(allele, 0); }
+
+ for ( final Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
+ final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles);
+ if (! a.isInformative() ) continue; // read is non-informative
+ final GATKSAMRecord read = el.getKey();
+ final int prevCount = alleleCounts.get(a.getMostLikelyAllele());
+ alleleCounts.put(a.getMostLikelyAllele(), prevCount + 1);
+ }
+
+ final int[] counts = new int[alleleCounts.size()];
+ counts[0] = alleleCounts.get(vc.getReference());
+ for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
+ counts[i+1] = alleleCounts.get( vc.getAlternateAllele(i) );
+
+ gb.AD(counts);
+ }
+
+ public List getKeyNames() { return Arrays.asList(VCFConstants.GENOTYPE_ALLELE_DEPTHS); }
+
+ public List getDescriptions() {
+ return Arrays.asList(VCFStandardHeaderLines.getFormatLine(getKeyNames().get(0)));
+ }
+}
\ No newline at end of file
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java
new file mode 100644
index 000000000..8e5ca83e0
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java
@@ -0,0 +1,126 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
+import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.variant.variantcontext.Allele;
+import org.broadinstitute.variant.variantcontext.Genotype;
+import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+import org.broadinstitute.variant.vcf.VCFConstants;
+import org.broadinstitute.variant.vcf.VCFFormatHeaderLine;
+import org.broadinstitute.variant.vcf.VCFStandardHeaderLines;
+
+import java.util.*;
+
+
+/**
+ * The depth of coverage for informative reads for each sample.
+ *
+ * An informative read is defined as one from which the allele it carries can be easily distinguished. An example of a
+ * case where a read might be uninformative is where it only partially overlaps a short tandem repeat and it is not clear
+ * whether the read contains the reference allele or e.g. an extra repeat.
+ * The depth here is the sum of the informative reads at this site as determined by the Haplotype Caller; as such it can
+ * only be calculated and generated through the Haplotype Caller (it will not work when run through the Variant Annotator).
+ * This calculation is not perfect but it is a pretty good proxy for depth and it does match the values in the AD field
+ * (i.e., sum(AD) = DP).
+ */
+public class DepthPerSampleHC extends GenotypeAnnotation {
+
+    /**
+     * Sets the sample-level depth (DP) to the number of informative reads in the allele likelihood map.
+     *
+     * <p>Nothing is written when the genotype is null or not called, when neither data source is supplied, or
+     * when the likelihood map is empty (there are no reads).
+     *
+     * @param tracker             not used by this annotation
+     * @param walker              not used by this annotation
+     * @param ref                 not used by this annotation
+     * @param stratifiedContext   only checked for null; its contents are never read
+     * @param vc                  variant context supplying the alleles
+     * @param g                   genotype being annotated; may be null
+     * @param gb                  builder that receives the DP value
+     * @param alleleLikelihoodMap per-read allele likelihoods for this sample
+     * @throws IllegalStateException if {@code alleleLikelihoodMap} is null while {@code stratifiedContext} is
+     *                               not, or if the likelihood map does not contain every allele of {@code vc}
+     */
+    public void annotate(final RefMetaDataTracker tracker,
+                         final AnnotatorCompatible walker,
+                         final ReferenceContext ref,
+                         final AlignmentContext stratifiedContext,
+                         final VariantContext vc,
+                         final Genotype g,
+                         final GenotypeBuilder gb,
+                         final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
+        if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) )
+            return;
+
+        if ( alleleLikelihoodMap == null )
+            throw new IllegalStateException("DepthPerSampleHC can only be used with likelihood based annotations in the HaplotypeCaller");
+
+        // there are no reads, so DP is left unset
+        if ( alleleLikelihoodMap.isEmpty() )
+            return;
+
+        final Set<Allele> alleles = new HashSet<>(vc.getAlleles());
+
+        // make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext
+        if ( ! alleleLikelihoodMap.getAllelesSet().containsAll(alleles) )
+            throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + alleleLikelihoodMap.getAllelesSet());
+
+        // the depth for the HC is the sum of the informative alleles at this site. It's not perfect (as we cannot
+        // differentiate between reads that align over the event but aren't informative vs. those that aren't even
+        // close) but it's a pretty good proxy and it matches with the AD field (i.e., sum(AD) = DP).
+        int dp = 0;
+        for ( final Map<Allele, Double> likelihoods : alleleLikelihoodMap.getLikelihoodReadMap().values() ) {
+            final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(likelihoods, alleles);
+            if ( a.isInformative() )
+                dp++;
+        }
+
+        gb.DP(dp);
+    }
+
+    /**
+     * @return a single-element list holding the DP key
+     */
+    public List<String> getKeyNames() {
+        return Collections.singletonList(VCFConstants.DEPTH_KEY);
+    }
+
+    /**
+     * @return a single-element list holding the standard FORMAT header line registered for the DP key
+     */
+    public List<VCFFormatHeaderLine> getDescriptions() {
+        return Collections.singletonList(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY));
+    }
+}
\ No newline at end of file
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java
new file mode 100644
index 000000000..a90f555a1
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java
@@ -0,0 +1,509 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import cern.jet.math.Arithmetic;
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
+import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.QualityUtils;
+import org.broadinstitute.variant.variantcontext.Genotype;
+import org.broadinstitute.variant.variantcontext.GenotypesContext;
+import org.broadinstitute.variant.vcf.VCFHeaderLineType;
+import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.variant.variantcontext.Allele;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+
+import java.util.*;
+
+
+/**
+ * Phred-scaled p-value using Fisher's Exact Test to detect strand bias
+ *
+ * <p>Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation
+ * being seen on only the forward or only the reverse strand) in the reads. More bias is
+ * indicative of false positive calls.
+ * </p>
+ *
+ * <h3>Caveat</h3>
+ * <p>The Fisher Strand test may not be calculated for certain complex indel cases or for multi-allelic sites.</p>
+ */
+public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
+ private final static boolean ENABLE_DEBUGGING = false; // gates the logging done by printTable(String, int[][])
+ private final static Logger logger = Logger.getLogger(FisherStrand.class);
+
+ private static final String FS = "FS"; // key under which the annotation value is reported
+ private static final double MIN_PVALUE = 1E-320; // floor applied to the p-value before phred-scaling (prevents INFINITYs)
+ private static final int MIN_QUAL_FOR_FILTERED_TEST = 17; // quality cutoff passed to getSNPContingencyTable when building the "filtered" table
+ private static final int MIN_COUNT = 2; // a sample's ref total and alt total must each exceed this for its counts to be used
+
+ /**
+  * Computes the FS annotation for a site.
+  *
+  * Sources of strand counts are tried in this order:
+  *   1. the per-sample strand bias attribute of the genotypes, when the VC has genotypes and
+  *      getTableFromSamples finds at least one annotated genotype;
+  *   2. for SNPs with alignment contexts, the base pileups, using the less significant of the
+  *      unfiltered table and the table filtered at MIN_QUAL_FOR_FILTERED_TEST;
+  *   3. otherwise the per-read allele likelihood maps, when provided.
+  *
+  * @param tracker not used by this method
+  * @param walker not used by this method
+  * @param ref not used by this method
+  * @param stratifiedContexts alignment contexts used for the SNP pileup tables; may be null
+  * @param vc the variant context being annotated
+  * @param stratifiedPerReadAlleleLikelihoodMap per-read allele likelihood maps; may be null
+  * @return the annotation map produced by pValueForBestTable, or null when the site is not variant
+  *         or none of the sources above is available
+  */
+ public Map annotate(final RefMetaDataTracker tracker,
+                     final AnnotatorCompatible walker,
+                     final ReferenceContext ref,
+                     final Map stratifiedContexts,
+                     final VariantContext vc,
+                     final Map stratifiedPerReadAlleleLikelihoodMap) {
+     if ( !vc.isVariant() ) {
+         return null;
+     }
+
+     // first choice: strand counts already attached to the genotypes
+     if ( vc.hasGenotypes() ) {
+         final int[][] perSampleTable = getTableFromSamples( vc.getGenotypes() );
+         if ( perSampleTable != null ) {
+             return pValueForBestTable(perSampleTable, null);
+         }
+     }
+
+     final boolean canUsePileup = vc.isSNP() && stratifiedContexts != null;
+     if ( !canUsePileup ) {
+         // either SNP with no alignment context, or indels: per-read likelihood map needed
+         if ( stratifiedPerReadAlleleLikelihoodMap == null ) {
+             return null;
+         }
+         final int[][] likelihoodTable = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc);
+         return pValueForBestTable(likelihoodTable, null);
+     }
+
+     // for SNPs the same information is available from the simple pileup
+     final int[][] unfiltered = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1);
+     final int[][] filtered = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), MIN_QUAL_FOR_FILTERED_TEST);
+     printTable("unfiltered", unfiltered);
+     printTable("filtered", filtered);
+     return pValueForBestTable(filtered, unfiltered);
+ }
+
+ /**
+ * Create the FisherStrand table by retrieving the per-sample strand bias annotation and adding them together
+ * @param genotypes the genotypes from which to pull out the per-sample strand bias annotation
+ * @return the table used for the FisherStrand p-value calculation, will be null if none of the genotypes contain the per-sample SB annotation
+ */
+ private int[][] getTableFromSamples( final GenotypesContext genotypes ) {
+ if( genotypes == null ) { throw new IllegalArgumentException("Genotypes cannot be null."); }
+
+ final int[] sbArray = {0,0,0,0}; // reference-forward-reverse -by- alternate-forward-reverse
+ boolean foundData = false;
+
+ for( final Genotype g : genotypes ) {
+ if( g.isNoCall() || ! g.hasAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME) )
+ continue;
+
+ foundData = true;
+ final String sbbsString = (String) g.getAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME);
+ final int[] data = encodeSBBS(sbbsString);
+ if ( passesMinimumThreshold(data) ) {
+ for( int index = 0; index < sbArray.length; index++ ) {
+ sbArray[index] += data[index];
+ }
+ }
+ }
+
+ return ( foundData ? decodeSBBS(sbArray) : null );
+ }
+
+ /**
+ * Does this strand data array pass the minimum threshold for inclusion?
+ *
+ * @param data the array
+ * @return true if it passes the minimum threshold, false otherwise
+ */
+ private static boolean passesMinimumThreshold(final int[] data) {
+ // the ref and alt totals must each be greater than MIN_COUNT
+ return data[0] + data[1] > MIN_COUNT && data[2] + data[3] > MIN_COUNT;
+ }
+
+ /**
+ * Create an annotation for the highest (i.e., least significant) p-value of table1 and table2
+ *
+ * @param table1 a contingency table, may be null
+ * @param table2 a contingency table, may be null
+ * @return annotation result for FS given tables
+ */
+ private Map pValueForBestTable(final int[][] table1, final int[][] table2) {
+ if ( table2 == null )
+ return table1 == null ? null : annotationForOneTable(pValueForContingencyTable(table1));
+ else if (table1 == null)
+ return annotationForOneTable(pValueForContingencyTable(table2));
+ else { // take the one with the best (i.e., least significant pvalue)
+ double pvalue1 = pValueForContingencyTable(table1);
+ double pvalue2 = pValueForContingencyTable(table2);
+ return annotationForOneTable(Math.max(pvalue1, pvalue2));
+ }
+ }
+
+ /**
+  * Turns a p-value into the annotation map for this annotation.
+  *
+  * @param pValue the p-value to report; values below MIN_PVALUE are raised to MIN_PVALUE before scaling
+  * @return a singleton map from FS -> phred-scaled pValue, formatted with three decimal places
+  */
+ private Map annotationForOneTable(final double pValue) {
+     // clamp from below first, which prevents INFINITYs after phred-scaling
+     final double boundedPValue = Math.max(pValue, MIN_PVALUE);
+     final Object formatted = String.format("%.3f", QualityUtils.phredScaleErrorRate(boundedPValue));
+     return Collections.singletonMap(FS, formatted);
+ }
+
+ /**
+  * Reports the single key produced by this annotation.
+  *
+  * @return a one-element list holding the FS key
+  */
+ public List getKeyNames() {
+     final List keys = Collections.singletonList(FS);
+     return keys;
+ }
+
+ /**
+  * Reports the header line describing the FS key: a single Float value.
+  *
+  * @return a one-element list holding the VCFInfoHeaderLine for FS
+  */
+ public List getDescriptions() {
+     final VCFInfoHeaderLine fsLine = new VCFInfoHeaderLine(FS, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias");
+     return Collections.singletonList(fsLine);
+ }
+
+ /**
+  * Helper function to turn the FisherStrand table into the SB annotation array.
+  * The cells are emitted row by row: [0][0], [0][1], [1][0], [1][1].
+  *
+  * @param table the 2x2 table used by the FisherStrand annotation
+  * @return the list of four counts used by the per-sample Strand Bias annotation
+  * @throws IllegalArgumentException if the table does not have exactly two rows of exactly two entries each
+  */
+ public static List getContingencyArray( final int[][] table ) {
+     if ( table.length != 2 ) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); }
+     // check every row, not just the first: a malformed second row must be rejected here rather than
+     // surfacing as an ArrayIndexOutOfBoundsException or being silently truncated
+     for ( final int[] row : table ) {
+         if ( row.length != 2 ) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); }
+     }
+     final List list = new ArrayList<>(4); // TODO - if we ever want to do something clever with multi-allelic sites this will need to change
+     for ( final int[] row : table ) {
+         for ( final int count : row ) {
+             list.add(count);
+         }
+     }
+     return list;
+ }
+
+ /**
+  * Helper function to parse the genotype annotation string into the SB annotation array.
+  * The first four comma-separated tokens are parsed as integers; empty tokens are skipped by the
+  * tokenizer and any tokens after the fourth are ignored.
+  *
+  * @param string the comma-separated strand bias string stored on the genotype
+  * @return the array of four counts used by the per-sample Strand Bias annotation
+  * @throws java.util.NoSuchElementException if the string holds fewer than four tokens
+  * @throws NumberFormatException if one of the first four tokens is not an integer
+  */
+ private static int[] encodeSBBS( final String string ) {
+     final StringTokenizer fields = new StringTokenizer(string, ",", false);
+     final int[] counts = new int[4];
+     int next = 0;
+     while ( next < counts.length ) {
+         counts[next] = Integer.parseInt(fields.nextToken());
+         next++;
+     }
+     return counts;
+ }
+
+ /**
+  * Helper function to turn the SB annotation array into the FisherStrand table.
+  * Entries 0 and 1 become the first row, entries 2 and 3 the second row.
+  *
+  * @param array the array of four counts used by the per-sample Strand Bias annotation
+  * @return the 2x2 table used by the FisherStrand annotation
+  * @throws IllegalArgumentException if the array does not have exactly four entries
+  */
+ private static int[][] decodeSBBS( final int[] array ) {
+     if ( array.length != 4 ) {
+         throw new IllegalArgumentException("Expecting a length = 4 strand bias array.");
+     }
+     return new int[][] {
+         { array[0], array[1] },
+         { array[2], array[3] }
+     };
+ }
+
+ /**
+  * Computes the p-value for a contingency table.
+  *
+  * The table is first normalized, then the probability of the observed table is taken as the cutoff.
+  * Starting from the observed table, the table is stepped with rotateTable and then (from a fresh copy)
+  * with unrotateTable until a cell goes negative; the probability of every visited table that is no
+  * more likely than the observed one is added to the result.
+  *
+  * @param originalTable the 2x2 table of counts; not modified unless normalizeContingencyTable modifies it
+  * @return the accumulated p-value, capped at 1.0
+  */
+ private Double pValueForContingencyTable(int[][] originalTable) {
+     final int[][] baseTable = normalizeContingencyTable(originalTable);
+
+     // work on copies: rotateTable/unrotateTable modify their argument in place
+     int[][] working = copyContingencyTable(baseTable);
+     final double observedProbability = computePValue(working);
+     double total = observedProbability;
+
+     for ( boolean valid = rotateTable(working); valid; valid = rotateTable(working) ) {
+         final double probability = computePValue(working);
+         if ( probability <= observedProbability ) {
+             total += probability;
+         }
+     }
+
+     // restart from the observed table and walk the other way
+     working = copyContingencyTable(baseTable);
+     for ( boolean valid = unrotateTable(working); valid; valid = unrotateTable(working) ) {
+         final double probability = computePValue(working);
+         if ( probability <= observedProbability ) {
+             total += probability;
+         }
+     }
+
+     // min is necessary as numerical precision can result in the total being slightly greater than 1.0
+     return Math.min(total, 1.0);
+ }
+
+ // how large do we want the normalized table to be? tables whose entries sum to more than twice this value get scaled down by (sum / this value)
+ private static final double TARGET_TABLE_SIZE = 200.0;
+
+ /**
+  * Scales a table down so that its entries are not too large.
+  * A table whose entries sum to at most twice TARGET_TABLE_SIZE is returned as is, so this method
+  * does NOT necessarily make a copy of the table being passed in! Otherwise every entry is divided
+  * by (sum / TARGET_TABLE_SIZE) and truncated to an int.
+  *
+  * @param table the original 2x2 table
+  * @return a normalized version of the table or the original table if it is already small enough
+  */
+ private static int[][] normalizeContingencyTable(final int[][] table) {
+     final int total = table[0][0] + table[0][1] + table[1][0] + table[1][1];
+     final boolean alreadySmall = total <= TARGET_TABLE_SIZE * 2;
+     if ( alreadySmall ) {
+         return table;
+     }
+
+     final double scale = (double) total / TARGET_TABLE_SIZE;
+     return new int[][] {
+         { (int) (table[0][0] / scale), (int) (table[0][1] / scale) },
+         { (int) (table[1][0] / scale), (int) (table[1][1] / scale) }
+     };
+ }
+
+ /**
+  * Makes an independent copy of the top-left 2x2 cells of a table.
+  *
+  * @param t the table to copy; must have at least two rows of at least two entries
+  * @return a new 2x2 array holding the same four values
+  */
+ private static int [][] copyContingencyTable(int [][] t) {
+     return new int[][] {
+         { t[0][0], t[0][1] },
+         { t[1][0], t[1][1] }
+     };
+ }
+
+
+ private static void printTable(int[][] table, double pValue) {
+ logger.info(String.format("%d %d; %d %d : %f", table[0][0], table[0][1], table[1][0], table[1][1], pValue));
+ }
+
+ /**
+  * Logs a named table together with its FS value at info level, for debugging purposes.
+  * Does nothing unless ENABLE_DEBUGGING is set.
+  *
+  * @param name the name of the table
+  * @param table the table itself
+  */
+ private void printTable(final String name, final int[][] table) {
+     if ( !ENABLE_DEBUGGING ) {
+         return;
+     }
+     // the logged value is the formatted FS annotation value, not the raw p-value
+     final Map annotation = annotationForOneTable(pValueForContingencyTable(table));
+     final String pValue = (String) annotation.get(FS);
+     logger.info(String.format("FS %s (REF+, REF-, ALT+, ALT-) = (%d, %d, %d, %d) = %s",
+             name, table[0][0], table[0][1], table[1][0], table[1][1], pValue));
+ }
+
+ private static boolean rotateTable(int[][] table) {
+ table[0][0] -= 1;
+ table[1][0] += 1;
+
+ table[0][1] += 1;
+ table[1][1] -= 1;
+
+ return (table[0][0] >= 0 && table[1][1] >= 0);
+ }
+
+ private static boolean unrotateTable(int[][] table) {
+ table[0][0] += 1;
+ table[1][0] -= 1;
+
+ table[0][1] -= 1;
+ table[1][1] += 1;
+
+ return (table[0][1] >= 0 && table[1][0] >= 0);
+ }
+
+ private static double computePValue(int[][] table) {
+
+ int[] rowSums = { sumRow(table, 0), sumRow(table, 1) };
+ int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) };
+ int N = rowSums[0] + rowSums[1];
+
+ // calculate in log space so we don't die with high numbers
+ double pCutoff = Arithmetic.logFactorial(rowSums[0])
+ + Arithmetic.logFactorial(rowSums[1])
+ + Arithmetic.logFactorial(colSums[0])
+ + Arithmetic.logFactorial(colSums[1])
+ - Arithmetic.logFactorial(table[0][0])
+ - Arithmetic.logFactorial(table[0][1])
+ - Arithmetic.logFactorial(table[1][0])
+ - Arithmetic.logFactorial(table[1][1])
+ - Arithmetic.logFactorial(N);
+ return Math.exp(pCutoff);
+ }
+
+ /**
+  * Adds up, over all rows of the table, the entry at the given column index.
+  * Despite the method name, the result is the total of one column.
+  *
+  * @param table the table of counts
+  * @param column the index of the entry to take from every row
+  * @return the sum of table[r][column] over all rows r
+  */
+ private static int sumRow(int[][] table, int column) {
+     int total = 0;
+     for ( final int[] currentRow : table ) {
+         total += currentRow[column];
+     }
+     return total;
+ }
+
+ /**
+  * Adds up all entries of the given row of the table.
+  * Despite the method name, the result is the total of one row.
+  *
+  * @param table the table of counts
+  * @param row the index of the row whose entries are added up
+  * @return the sum of table[row][c] over all entries c of that row
+  */
+ private static int sumColumn(int[][] table, int row) {
+     int total = 0;
+     for ( final int count : table[row] ) {
+         total += count;
+     }
+     return total;
+ }
+
+ /**
+ * Allocate and fill a 2x2 strand contingency table from per-read allele likelihoods. In the end, it'll look something like this:
+ *            fw   rc
+ * allele1    #    #
+ * allele2    #    #
+ * where allele1 is the VC's reference allele and allele2 is its alt allele with the highest allele count.
+ * Counts are gathered separately for each value of the input map and are only added to the result
+ * when they pass passesMinimumThreshold.
+ *
+ * @param stratifiedPerReadAlleleLikelihoodMap the per-read allele likelihood maps to count reads from; must not be null (presumably one map per sample -- confirm against callers)
+ * @param vc the variant context supplying the ref and alt alleles; must not be null
+ * @return a 2x2 contingency table
+ * @throws IllegalArgumentException if either argument is null
+ */
+ public static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) {
+ if( stratifiedPerReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("stratifiedPerReadAlleleLikelihoodMap cannot be null"); }
+ if( vc == null ) { throw new IllegalArgumentException("input vc cannot be null"); }
+
+ final Allele ref = vc.getReference();
+ final Allele alt = vc.getAltAlleleWithHighestAlleleCount();
+ final int[][] table = new int[2][2];
+
+ for (final PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) {
+ final int[] myTable = new int[4]; // counts for this map only, flattened as ref-fw, ref-rc, alt-fw, alt-rc
+ for (final Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { // NOTE(review): the type arguments on Map.Entry look truncated (stray '>') -- confirm against the original source
+ final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
+ final GATKSAMRecord read = el.getKey();
+ updateTable(myTable, mostLikelyAllele.getAlleleIfInformative(), read, ref, alt); // only counted when the allele equals ref or alt
+ }
+ if ( passesMinimumThreshold(myTable) )
+ copyToMainTable(myTable, table);
+ }
+
+ return table;
+ }
+
+ /**
+  * Helper method to add the per-sample counts into the main table.
+  * Entry i of the per-sample table is added to cell [i / 2][i % 2] of the main table.
+  *
+  * @param perSampleTable per-sample table (single dimension, four entries)
+  * @param mainTable main table (two dimensions), updated in place
+  */
+ private static void copyToMainTable(final int[] perSampleTable, final int[][] mainTable) {
+     for ( int flat = 0; flat < 4; flat++ ) {
+         mainTable[flat / 2][flat % 2] += perSampleTable[flat];
+     }
+ }
+
+ /**
+ * Allocate and fill a 2x2 strand contingency table from the base pileups of the given contexts. In the end, it'll look something like this:
+ *            fw   rc
+ * allele1    #    #
+ * allele2    #    #
+ * where allele1 is ref and allele2 is alt. Counts are gathered separately for each entry of the
+ * input map and are only added to the result when they pass passesMinimumThreshold.
+ *
+ * @param stratifiedContexts the contexts whose base pileups are counted (presumably keyed by sample name -- confirm against callers)
+ * @param ref the reference allele (first row of the table)
+ * @param alt the alternate allele (second row of the table)
+ * @param minQScoreToConsider pileup elements whose base quality or mapping quality is below this value are skipped; annotate() passes -1 for its unfiltered table
+ * @return a 2x2 contingency table
+ */
+ private static int[][] getSNPContingencyTable(final Map stratifiedContexts,
+ final Allele ref,
+ final Allele alt,
+ final int minQScoreToConsider ) {
+ int[][] table = new int[2][2];
+
+ for ( Map.Entry sample : stratifiedContexts.entrySet() ) { // NOTE(review): Map.Entry appears here without type arguments although getValue() is used as a typed value -- confirm against the original source
+ final int[] myTable = new int[4]; // counts for this entry only, flattened as ref-fw, ref-rc, alt-fw, alt-rc
+ for (PileupElement p : sample.getValue().getBasePileup()) {
+
+ if ( ! isUsableBase(p) ) // ignore deletions and bad MQ
+ continue;
+
+ if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider ) // the same cutoff is applied to base quality and mapping quality
+ continue;
+
+ updateTable(myTable, Allele.create(p.getBase(), false), p.getRead(), ref, alt);
+ }
+ if ( passesMinimumThreshold(myTable) )
+ copyToMainTable(myTable, table);
+ }
+
+ return table;
+ }
+
+ /**
+  * Can the base in this pileup element be used in comparative tests?
+  * A base is rejected if it is a deletion, if its mapping quality is 0 or equal to
+  * QualityUtils.MAPPING_QUALITY_UNAVAILABLE, or if its base quality is below QualityUtils.MIN_USABLE_Q_SCORE.
+  *
+  * @param p the pileup element to consider
+  *
+  * @return true if none of the rejection criteria apply, false otherwise
+  */
+ private static boolean isUsableBase(final PileupElement p) {
+     if ( p.isDeletion() ) {
+         return false;
+     }
+     if ( p.getMappingQual() == 0 ) {
+         return false;
+     }
+     if ( p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) {
+         return false;
+     }
+     return !( ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE );
+ }
+
+ /**
+  * Adds one read's observation to a flattened strand table laid out as
+  * ref-forward, ref-reverse, alt-forward, alt-reverse.
+  * Reads whose allele equals neither ref nor alt leave the table untouched.
+  *
+  * @param table the flattened four-entry table, updated in place
+  * @param allele the allele observed for this read
+  * @param read the read, consulted for its strand
+  * @param ref the reference allele
+  * @param alt the alternate allele
+  */
+ private static void updateTable(final int[] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt) {
+     final boolean matchesRef = allele.equals(ref, true);
+     final boolean matchesAlt = allele.equals(alt, true);
+     if ( !matchesRef && !matchesAlt ) {
+         return;
+     }
+
+     // ref counts live at indices 0-1, alt counts at indices 2-3
+     final int forwardIndex = matchesRef ? 0 : 2;
+     final int reverseIndex = forwardIndex + 1;
+
+     if ( read.isStrandless() ) {
+         // a strandless read (e.g. a merged read) counts as one full observation on each strand,
+         // so that it always shows up on both strands even if it is only seen once
+         table[forwardIndex] += 1;
+         table[reverseIndex] += 1;
+         return;
+     }
+
+     // a normal read with an actual strand
+     if ( read.getReadNegativeStrandFlag() ) {
+         table[reverseIndex] += 1;
+     } else {
+         table[forwardIndex] += 1;
+     }
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LikelihoodRankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/LikelihoodRankSumTest.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LikelihoodRankSumTest.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/LikelihoodRankSumTest.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java
new file mode 100644
index 000000000..7ebbd49dd
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java
@@ -0,0 +1,191 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
+import org.broadinstitute.variant.vcf.VCFHeaderLineType;
+import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.variant.variantcontext.Genotype;
+import org.broadinstitute.variant.variantcontext.GenotypesContext;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+
+import java.util.*;
+
+/**
+ * Variant confidence (from the QUAL field) / unfiltered depth of non-reference samples. Note that the QD is also normalized by event length.
+ *
+ * Low scores are indicative of false positive calls and artifacts. Note that QualByDepth requires sequencing
+ * reads associated with the samples with polymorphic genotypes.
+ */
+public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
+
+    /**
+     * Computes QD: -10 * log10PError of the call divided by the depth summed over its het and hom-var samples.
+     *
+     * @param tracker                    not read by this annotation
+     * @param walker                     not read by this annotation
+     * @param ref                        only compared against null, to decide whether allele-length normalization applies
+     * @param stratifiedContexts         sample name to alignment context; may be null or empty
+     * @param vc                         the call being annotated
+     * @param perReadAlleleLikelihoodMap sample name to per-read likelihoods; may be null
+     * @return a one-entry map from "QD" to the value formatted as "%.2f", or null without an error estimate, genotypes or depth
+     */
+    public Map<String, Object> annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, final ReferenceContext ref,
+                                        final Map<String, AlignmentContext> stratifiedContexts, final VariantContext vc,
+                                        final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap ) {
+        if ( !vc.hasLog10PError() )
+            return null;
+
+        final GenotypesContext genotypes = vc.getGenotypes();
+        if ( genotypes == null || genotypes.size() == 0 )
+            return null;
+
+        int standardDepth = 0;
+        int ADrestrictedDepth = 0;
+
+        for ( final Genotype genotype : genotypes ) {
+            // we care only about variant calls with likelihoods
+            if ( !genotype.isHet() && !genotype.isHomVar() )
+                continue;
+
+            // if we have the AD values for this sample, let's make sure that the variant depth is greater than 1!
+            // TODO -- If we like how this is working and want to apply it to a situation other than the single sample HC pipeline,
+            // TODO -- then we will need to modify the annotateContext() - and related - routines in the VariantAnnotatorEngine
+            // TODO -- so that genotype-level annotations are run first (to generate AD on the samples) and then the site-level
+            // TODO -- annotations must come afterwards (so that QD can use the AD).
+            if ( genotype.hasAD() ) {
+                final int[] AD = genotype.getAD();
+                final int totalADdepth = (int)MathUtils.sum(AD);
+                if ( totalADdepth - AD[0] > 1 )
+                    ADrestrictedDepth += totalADdepth;
+                standardDepth += totalADdepth;
+                continue;
+            }
+
+            if ( stratifiedContexts != null && !stratifiedContexts.isEmpty() ) {
+                final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
+                if ( context == null )
+                    continue;
+                standardDepth += context.getBasePileup().depthOfCoverage();
+            } else if ( perReadAlleleLikelihoodMap != null ) {
+                final PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = perReadAlleleLikelihoodMap.get(genotype.getSampleName());
+                if ( perReadAlleleLikelihoods == null || perReadAlleleLikelihoods.isEmpty() )
+                    continue;
+                standardDepth += perReadAlleleLikelihoods.getNumberOfStoredElements();
+            } else if ( genotype.hasDP() ) {
+                standardDepth += genotype.getDP();
+            }
+        }
+
+        // if the AD-restricted depth is a usable value (i.e. not zero), then we should use that one going forward
+        if ( ADrestrictedDepth > 0 )
+            standardDepth = ADrestrictedDepth;
+
+        if ( standardDepth == 0 )
+            return null;
+
+        final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc);
+        // Hack: when ref == null then we know we are coming from the HaplotypeCaller and do not want to do a
+        // full length-based normalization (because the indel length problem is present only in the UnifiedGenotyper)
+        double QD = -10.0 * vc.getLog10PError() / ((double)standardDepth * indelNormalizationFactor(altAlleleLength, ref != null));
+
+        // Hack: see note in the fixTooHighQD method below
+        QD = fixTooHighQD(QD);
+        // fixed locale: a locale-specific decimal comma would collide with the comma separating values in a VCF INFO field
+        final Map<String, Object> map = new HashMap<>();
+        map.put(getKeyNames().get(0), String.format(Locale.US, "%.2f", QD));
+        return map;
+    }
+
+    /**
+     * Generate the indel normalization factor.
+     *
+     * @param altAlleleLength the average alternate allele length for the call
+     * @param increaseNormalizationAsLengthIncreases should we apply a normalization factor based on the allele length?
+     * @return a positive double: max(altAlleleLength / 3, 1) when normalizing by length, otherwise 1
+     */
+    private double indelNormalizationFactor(final double altAlleleLength, final boolean increaseNormalizationAsLengthIncreases) {
+        return ( increaseNormalizationAsLengthIncreases ? Math.max(altAlleleLength / 3.0, 1.0) : 1.0);
+    }
+
+    /**
+     * The haplotype caller generates very high quality scores when multiple events are on the
+     * same haplotype.  This causes some very good variants to have unusually high QD values,
+     * and VQSR will filter these out.  This code looks at the QD value, and if it is above
+     * threshold we map it down to the mean high QD value, with some jittering
+     *
+     * // TODO -- remove me when HaplotypeCaller bubble caller is live
+     *
+     * @param QD the raw QD score
+     * @return QD itself when below MAX_QD_BEFORE_FIXING, otherwise IDEAL_HIGH_QD plus Gaussian jitter
+     */
+    private double fixTooHighQD(final double QD) {
+        return QD < MAX_QD_BEFORE_FIXING ? QD : IDEAL_HIGH_QD + GenomeAnalysisEngine.getRandomGenerator().nextGaussian() * JITTER_SIGMA;
+    }
+
+    private final static double MAX_QD_BEFORE_FIXING = 35; // raw QD values at or above this get replaced
+    private final static double IDEAL_HIGH_QD = 30;        // center of the replacement values
+    private final static double JITTER_SIGMA = 3;          // scale applied to the nextGaussian() draw
+
+    /** @return a one-element list holding the INFO key written by this annotation, "QD" */
+    public List<String> getKeyNames() { return Arrays.asList("QD"); }
+
+    /** @return a one-element list holding the header line that declares QD as a single Float value */
+    public List<VCFInfoHeaderLine> getDescriptions() {
+        return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth"));
+    }
+}
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java
new file mode 100644
index 000000000..44e44c63b
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java
@@ -0,0 +1,119 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.QualityUtils;
+import org.broadinstitute.variant.vcf.VCFConstants;
+import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.variant.vcf.VCFStandardHeaderLines;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+
+import java.util.*;
+
+
+/**
+ * Root Mean Square of the mapping quality of the reads across all samples.
+ */
+public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
+    /**
+     * Computes the root mean square of the read mapping qualities, taken from the pileups when
+     * {@code stratifiedContexts} is non-null and otherwise from the reads stored in {@code perReadAlleleLikelihoodMap}.
+     * Reads whose mapping quality equals QualityUtils.MAPPING_QUALITY_UNAVAILABLE are left out; tracker, walker, ref and vc are not read.
+     *
+     * @return a one-entry map from the key to the RMS as "%.2f" (US locale, so never a decimal comma), or null when the chosen source is null or empty
+     */
+    public Map<String, Object> annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, final ReferenceContext ref,
+                                        final Map<String, AlignmentContext> stratifiedContexts, final VariantContext vc,
+                                        final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap ) {
+        final List<Integer> qualities = new ArrayList<>();
+        if ( stratifiedContexts != null ) {
+            if ( stratifiedContexts.size() == 0 )
+                return null;
+            for ( final AlignmentContext context : stratifiedContexts.values() ) {
+                for ( final PileupElement p : context.getBasePileup() )
+                    fillMappingQualitiesFromPileup(p.getRead().getMappingQuality(), qualities);
+            }
+        } else if ( perReadAlleleLikelihoodMap != null ) {
+            if ( perReadAlleleLikelihoodMap.size() == 0 )
+                return null;
+            for ( final PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) {
+                for ( final GATKSAMRecord read : perReadLikelihoods.getStoredElements() )
+                    fillMappingQualitiesFromPileup(read.getMappingQuality(), qualities);
+            }
+        } else {
+            return null;
+        }
+
+        final double rms = MathUtils.rms(qualities);
+        return Collections.singletonMap(getKeyNames().get(0), (Object)String.format(Locale.US, "%.2f", rms));
+    }
+
+    /** Appends {@code mq} to {@code qualities} unless it equals QualityUtils.MAPPING_QUALITY_UNAVAILABLE. */
+    private static void fillMappingQualitiesFromPileup(final int mq, final List<Integer> qualities) {
+        if ( mq != QualityUtils.MAPPING_QUALITY_UNAVAILABLE )
+            qualities.add(mq);
+    }
+
+    /** @return a one-element list holding VCFConstants.RMS_MAPPING_QUALITY_KEY */
+    public List<String> getKeyNames() { return Arrays.asList(VCFConstants.RMS_MAPPING_QUALITY_KEY); }
+
+    /** @return a one-element list holding the header line VCFStandardHeaderLines.getInfoLine returns for that key */
+    public List<VCFInfoHeaderLine> getDescriptions() {
+        return Arrays.asList(VCFStandardHeaderLines.getInfoLine(getKeyNames().get(0)));
+    }
+}
\ No newline at end of file
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java
new file mode 100644
index 000000000..13211c44c
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java
@@ -0,0 +1,264 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.MannWhitneyU;
+import org.broadinstitute.sting.utils.QualityUtils;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.variant.vcf.VCFHeaderLine;
+import org.broadinstitute.sting.utils.collections.Pair;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.variant.variantcontext.Allele;
+import org.broadinstitute.variant.variantcontext.Genotype;
+import org.broadinstitute.variant.variantcontext.GenotypesContext;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+
+import java.util.*;
+
+
+/**
+ * Abstract root for all RankSum based annotations
+ */
+public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
+    static final boolean DEBUG = false;
+    private boolean useDithering = true;
+
+    /**
+     * Runs a one-sided Mann-Whitney U test on per-read values split by whether the read supports the
+     * reference allele or an alternate allele.
+     *
+     * For every genotype (in sample-name order) the values come from the sample's per-read likelihood map
+     * when one is present and non-empty; otherwise they come from the sample's pileup, if available.
+     *
+     * @param tracker                              not used by this method
+     * @param walker                               not used by this method
+     * @param ref                                  not used by this method
+     * @param stratifiedContexts                   per-sample alignment contexts; may be null
+     * @param vc                                   the variant being annotated
+     * @param stratifiedPerReadAlleleLikelihoodMap per-sample read likelihood maps; may be null
+     * @return null when the variant has no genotypes or no values were collected; otherwise a map that
+     *         holds the test statistic (three decimals) under the annotation key, or is empty when the
+     *         statistic is NaN
+     */
+    public Map<String, Object> annotate(final RefMetaDataTracker tracker,
+                                        final AnnotatorCompatible walker,
+                                        final ReferenceContext ref,
+                                        final Map<String, AlignmentContext> stratifiedContexts,
+                                        final VariantContext vc,
+                                        final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
+        // either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null
+
+        final GenotypesContext genotypes = vc.getGenotypes();
+        if (genotypes == null || genotypes.size() == 0)
+            return null;
+
+        final List<Double> refQuals = new ArrayList<>();
+        final List<Double> altQuals = new ArrayList<>();
+
+        for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) {
+
+            boolean usePileup = true;
+
+            if ( stratifiedPerReadAlleleLikelihoodMap != null ) {
+                final PerReadAlleleLikelihoodMap likelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName());
+                if ( likelihoodMap != null && !likelihoodMap.isEmpty() ) {
+                    fillQualsFromLikelihoodMap(vc.getAlleles(), vc.getStart(), likelihoodMap, refQuals, altQuals);
+                    usePileup = false;
+                }
+            }
+
+            // the old UG SNP-only path through the annotations
+            if ( usePileup && stratifiedContexts != null ) {
+                final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
+                if ( context != null ) {
+                    final ReadBackedPileup pileup = context.getBasePileup();
+                    if ( pileup != null )
+                        fillQualsFromPileup(vc.getAlleles(), pileup, refQuals, altQuals);
+                }
+            }
+        }
+
+        if ( refQuals.isEmpty() && altQuals.isEmpty() )
+            return null;
+
+        final MannWhitneyU mannWhitneyU = new MannWhitneyU(useDithering);
+        for (final Double qual : altQuals) {
+            mannWhitneyU.add(qual, MannWhitneyU.USet.SET1);
+        }
+        for (final Double qual : refQuals) {
+            mannWhitneyU.add(qual, MannWhitneyU.USet.SET2);
+        }
+
+        if (DEBUG) {
+            System.out.format("%s, REF QUALS:", this.getClass().getName());
+            for (final Double qual : refQuals)
+                System.out.format("%4.1f ", qual);
+            System.out.println();
+            System.out.format("%s, ALT QUALS:", this.getClass().getName());
+            for (final Double qual : altQuals)
+                System.out.format("%4.1f ", qual);
+            System.out.println();
+
+        }
+        // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases)
+        final Pair<Double, Double> testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1);
+
+        final Map<String, Object> map = new HashMap<>();
+        if (!Double.isNaN(testResults.first))
+            map.put(getKeyNames().get(0), String.format("%.3f", testResults.first));
+        return map;
+    }
+
+    /**
+     * Collects test values from a pileup, sorting each usable element into the ref or alt list
+     * according to the base it carries.
+     *
+     * @param alleles  the alleles of the variant; the first entry is compared against as the reference
+     * @param pileup   the pileup to scan
+     * @param refQuals receives values for elements whose base matches the reference allele
+     * @param altQuals receives values for elements whose base matches any other allele in {@code alleles}
+     */
+    private void fillQualsFromPileup(final List<Allele> alleles,
+                                     final ReadBackedPileup pileup,
+                                     final List<Double> refQuals,
+                                     final List<Double> altQuals) {
+        for ( final PileupElement p : pileup ) {
+            if ( isUsableBase(p) ) {
+                final Double value = getElementForPileupElement(p);
+                if ( value == null )
+                    continue;
+
+                if ( alleles.get(0).equals(Allele.create(p.getBase(), true)) )
+                    refQuals.add(value);
+                else if ( alleles.contains(Allele.create(p.getBase())) )
+                    altQuals.add(value);
+            }
+        }
+    }
+
+    /**
+     * Collects test values from a per-read likelihood map, sorting each usable, informative read into
+     * the ref or alt list according to its most likely allele.
+     *
+     * @param alleles       the alleles of the variant
+     * @param refLoc        the reference position of the variant
+     * @param likelihoodMap the per-read allele likelihoods of one sample
+     * @param refQuals      receives values for reads whose most likely allele is the reference
+     * @param altQuals      receives values for reads whose most likely allele is another member of {@code alleles}
+     */
+    private void fillQualsFromLikelihoodMap(final List<Allele> alleles,
+                                            final int refLoc,
+                                            final PerReadAlleleLikelihoodMap likelihoodMap,
+                                            final List<Double> refQuals,
+                                            final List<Double> altQuals) {
+        for ( final Map.Entry<GATKSAMRecord, Map<Allele, Double>> el : likelihoodMap.getLikelihoodReadMap().entrySet() ) {
+            final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
+            if ( ! a.isInformative() )
+                continue; // read is non-informative
+
+            final GATKSAMRecord read = el.getKey();
+            if ( isUsableRead(read, refLoc) ) {
+                final Double value = getElementForRead(read, refLoc, a);
+                if ( value == null )
+                    continue;
+
+                if ( a.getMostLikelyAllele().isReference() )
+                    refQuals.add(value);
+                else if ( alleles.contains(a.getMostLikelyAllele()) )
+                    altQuals.add(value);
+            }
+        }
+    }
+
+    /**
+     * Get the element for the given read at the given reference position
+     *
+     * This default implementation ignores {@code mostLikelyAllele} and delegates to
+     * {@link #getElementForRead(GATKSAMRecord, int)}.
+     *
+     * @param read     the read
+     * @param refLoc   the reference position
+     * @param mostLikelyAllele the most likely allele for this read
+     * @return a Double representing the element to be used in the rank sum test, or null if it should not be used
+     */
+    protected Double getElementForRead(final GATKSAMRecord read, final int refLoc, final MostLikelyAllele mostLikelyAllele) {
+        return getElementForRead(read, refLoc);
+    }
+
+    /**
+     * Get the element for the given read at the given reference position
+     *
+     * @param read     the read
+     * @param refLoc   the reference position
+     * @return a Double representing the element to be used in the rank sum test, or null if it should not be used
+     */
+    protected abstract Double getElementForRead(final GATKSAMRecord read, final int refLoc);
+
+    // TODO -- until the ReadPosRankSumTest stops treating these differently, we need to have separate methods for GATKSAMRecords and PileupElements.  Yuck.
+
+    /**
+     * Get the element for the given pileup element
+     *
+     * By default this function returns null, indicating that the test doesn't support the old style of pileup calculations
+     *
+     * @param p        the pileup element
+     * @return a Double representing the element to be used in the rank sum test, or null if it should not be used
+     */
+    protected Double getElementForPileupElement(final PileupElement p) {
+        // does not work in pileup mode
+        return null;
+    }
+
+    /**
+     * Can the base in this pileup element be used in comparative tests between ref / alt bases?
+     *
+     * Note that this function by default does not allow deletion pileup elements
+     *
+     * @param p the pileup element to consider
+     * @return true if this base is part of a meaningful read for comparison, false otherwise
+     */
+    protected boolean isUsableBase(final PileupElement p) {
+        return !(p.isDeletion() ||
+                 p.getMappingQual() == 0 ||
+                 p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
+                 ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here
+    }
+
+    /**
+     * Can the read be used in comparative tests between ref / alt bases?
+     *
+     * This default implementation only inspects the mapping quality; {@code refLoc} is not consulted.
+     *
+     * @param read   the read to consider
+     * @param refLoc the reference location
+     * @return true if this read is meaningful for comparison, false otherwise
+     */
+    protected boolean isUsableRead(final GATKSAMRecord read, final int refLoc) {
+        return !( read.getMappingQuality() == 0 ||
+                  read.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE );
+    }
+
+    /**
+     * Initialize the rank sum test annotation using walker and engine information.  Right now this checks to see if
+     * engine randomization is turned off, and if so does not dither.
+     * @param walker the walker
+     * @param toolkit the GATK engine
+     * @param headerLines the header lines
+     */
+    public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ) {
+        useDithering = ! toolkit.getArguments().disableDithering;
+    }
+}
\ No newline at end of file
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java
new file mode 100644
index 000000000..417f3b595
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java
@@ -0,0 +1,105 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.variant.vcf.VCFHeaderLineType;
+import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+
+/**
+ * Fraction of reads containing spanning deletions at this site
+ *
+ * <p>Note that this annotation is currently not compatible with HaplotypeCaller.</p>
+ */
+public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation {
+
+    /**
+     * Computes the fraction of pileup elements, pooled over all samples, that are deletions at this site.
+     *
+     * @param tracker                              not used by this annotation
+     * @param walker                               not used by this annotation
+     * @param ref                                  not used by this annotation
+     * @param stratifiedContexts                   per-sample alignment contexts; may be null
+     * @param vc                                   the variant being annotated; only SNP sites are annotated
+     * @param stratifiedPerReadAlleleLikelihoodMap not used by this annotation
+     * @return a single-entry map from the annotation key to the fraction formatted with two decimals
+     *         ("0.00" when the pooled depth is zero), or null when there are no contexts or the site is not a SNP
+     */
+    public Map<String, Object> annotate(final RefMetaDataTracker tracker,
+                                        final AnnotatorCompatible walker,
+                                        final ReferenceContext ref,
+                                        final Map<String, AlignmentContext> stratifiedContexts,
+                                        final VariantContext vc,
+                                        final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
+        // this annotation is pileup-only: without contexts there is nothing to count, so report
+        // "no annotation" instead of dereferencing a null map
+        if ( stratifiedContexts == null || stratifiedContexts.isEmpty() )
+            return null;
+
+        // not meaningful when we're at an indel location: deletions that start at location N are by definition called at the position N-1, and at position N-1
+        // there are no informative deletions in the pileup
+        if (!vc.isSNP())
+            return null;
+
+        int deletions = 0;
+        int depth = 0;
+        for ( final Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
+            for ( final PileupElement p : sample.getValue().getBasePileup() ) {
+                depth++;
+                if ( p.isDeletion() )
+                    deletions++;
+            }
+        }
+        final Map<String, Object> map = new HashMap<>();
+        map.put(getKeyNames().get(0), String.format("%.2f", depth == 0 ? 0.0 : (double)deletions/(double)depth));
+        return map;
+    }
+
+    /** @return the single INFO key written by this annotation */
+    public List<String> getKeyNames() { return Arrays.asList("Dels"); }
+
+    /** @return the INFO header line describing the "Dels" field */
+    public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("Dels", 1, VCFHeaderLineType.Float, "Fraction of Reads Containing Spanning Deletions")); }
+}
\ No newline at end of file
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java
new file mode 100644
index 000000000..ec1c1e729
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java
@@ -0,0 +1,99 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.variant.variantcontext.Genotype;
+import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+import org.broadinstitute.variant.vcf.VCFFormatHeaderLine;
+import org.broadinstitute.variant.vcf.VCFHeaderLineType;
+
+import java.util.*;
+
+/**
+ * Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias
+ * User: rpoplin
+ * Date: 8/28/13
+ */
+
+public class StrandBiasBySample extends GenotypeAnnotation {
+
+    /** FORMAT key under which the per-sample contingency values are stored. */
+    public final static String STRAND_BIAS_BY_SAMPLE_KEY_NAME = "SB";
+
+    /**
+     * Adds the strand-bias contingency values for one sample to the genotype being built.
+     *
+     * The contingency table is computed by FisherStrand from this sample's likelihood map alone and is
+     * stored on the builder under {@link #STRAND_BIAS_BY_SAMPLE_KEY_NAME}. Nothing is added when the
+     * likelihood map or genotype is null, or when the genotype is not called.
+     *
+     * @param tracker             not used by this annotation
+     * @param walker              not used by this annotation
+     * @param ref                 not used by this annotation
+     * @param stratifiedContext   not used by this annotation
+     * @param vc                  the variant context handed to the contingency-table computation
+     * @param g                   the genotype of the sample being annotated
+     * @param gb                  the builder that receives the attribute
+     * @param alleleLikelihoodMap the per-read allele likelihoods of this sample
+     */
+    @Override
+    public void annotate(final RefMetaDataTracker tracker,
+                         final AnnotatorCompatible walker,
+                         final ReferenceContext ref,
+                         final AlignmentContext stratifiedContext,
+                         final VariantContext vc,
+                         final Genotype g,
+                         final GenotypeBuilder gb,
+                         final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
+        if ( ! isAppropriateInput(alleleLikelihoodMap, g) )
+            return;
+
+        // Wrap the single sample in a map because the table computation takes a per-sample map.
+        final int[][] table = FisherStrand.getContingencyTable(Collections.singletonMap(g.getSampleName(), alleleLikelihoodMap), vc);
+
+        gb.attribute(STRAND_BIAS_BY_SAMPLE_KEY_NAME, FisherStrand.getContingencyArray(table));
+    }
+
+    /** @return the single FORMAT key written by this annotation */
+    @Override
+    public List<String> getKeyNames() { return Collections.singletonList(STRAND_BIAS_BY_SAMPLE_KEY_NAME); }
+
+    /** @return the header line declaring the key as four Integer values per sample */
+    @Override
+    public List<VCFFormatHeaderLine> getDescriptions() { return Collections.singletonList(new VCFFormatHeaderLine(getKeyNames().get(0), 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.")); }
+
+    /**
+     * @param map the likelihood map for the sample, may be null
+     * @param g   the genotype for the sample, may be null
+     * @return true only when both arguments are non-null and the genotype is called
+     */
+    private boolean isAppropriateInput(final PerReadAlleleLikelihoodMap map, final Genotype g) {
+        return map != null && g != null && g.isCalled();
+    }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantType.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariates.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariates.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariates.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/AnalyzeCovariates.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java
new file mode 100644
index 000000000..3da04ef86
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java
@@ -0,0 +1,536 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.bqsr;
+
+import net.sf.picard.reference.IndexedFastaSequenceFile;
+import net.sf.samtools.CigarElement;
+import net.sf.samtools.SAMFileHeader;
+import org.broad.tribble.Feature;
+import org.broadinstitute.sting.commandline.Advanced;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.ArgumentCollection;
+import org.broadinstitute.sting.gatk.CommandLineGATK;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.filters.*;
+import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.*;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.baq.BAQ;
+import org.broadinstitute.sting.utils.clipping.ReadClipper;
+import org.broadinstitute.sting.utils.collections.Pair;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
+import org.broadinstitute.sting.utils.help.HelpConstants;
+import org.broadinstitute.sting.utils.recalibration.*;
+import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
+
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context).
+ *
+ *
+ * This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating
+ * only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative
+ * of poor base quality. This walker generates tables based on various user-specified covariates (such as read group,
+ * reported quality score, cycle, and context). Since there is a large amount of data one can then calculate an empirical
+ * probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations.
+ * The output file is a table (of the several covariate values, num observations, num mismatches, empirical quality score).
+ *
+ * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified.
+ *
+ *
+ *
+ *
+ * <h3>Input</h3>
+ *
+ * The input read data whose base quality scores need to be assessed.
+ *
+ * A database of known polymorphic sites to skip over.
+ *
+ *
+ *
+ * <h3>Output</h3>
+ *
+ * A GATK Report file with many tables:
+ *
+ * <ol>
+ *     <li>The list of arguments</li>
+ *     <li>The quantized qualities table</li>
+ *     <li>The recalibration table by read group</li>
+ *     <li>The recalibration table by quality score</li>
+ *     <li>The recalibration table for all the optional covariates</li>
+ * </ol>
+ *
+ * The GATK Report is intended to be easy to read by humans or computers. Check out the documentation of the GATKReport to learn how to manipulate this table.
+ *
+ *
+ *
+ */
+
+@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class})
+@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
+@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
+@PartitionBy(PartitionType.READ)
+public class BaseRecalibrator extends ReadWalker implements NanoSchedulable {
+ /**
+ * all the command line arguments for BQSR and its covariates
+ */
+ @ArgumentCollection
+ private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
+
+ /**
+ * When you have nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency
+ * purposes. If you have many covariates, and therefore are using a lot of memory, you can use this flag
+ * to safely access only one table. There may be some CPU cost, but as long as the table is really big
+ * the CPU cost should be relatively small.
+ */
+ @Argument(fullName = "lowMemoryMode", shortName="lowMemoryMode", doc="Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false)
+ public boolean lowMemoryMode = false;
+
+ @Advanced
+ @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false)
+ public double BAQGOP = BAQ.DEFAULT_GOP;
+
+ /**
+ * an object that keeps track of the information necessary for quality score quantization
+ */
+ private QuantizationInfo quantizationInfo;
+
+ /**
+ * array holding all the covariate objects that were requested (required covariates first, then optional ones)
+ */
+ private Covariate[] requestedCovariates;
+
+ private RecalibrationEngine recalibrationEngine; // built in initializeRecalibrationEngine(); map() feeds it one ReadRecalibrationInfo per processed read
+
+ private int minimumQToUse; // bases with quality strictly below this are skipped; set from the toolkit's PRESERVE_QSCORES_LESS_THAN argument
+
+ private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation."; // message of the exception thrown by initialize() when no known sites are given
+
+ private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector
+ private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation
+ private final static byte NO_BAQ_UNCERTAINTY = (byte)'@'; // NOTE(review): presumably the BAQ value meaning "no alignment uncertainty" -- confirm against its users further down the file
+
+    /**
+     * Prepares the walker before traversal.
+     *
+     * In order: creates the BAQ calculator from the configured gap open penalty, applies a forced
+     * platform as the default platform, verifies that known sites were supplied (unless running
+     * without them was requested), optionally lists the available covariates and exits, records any
+     * existing recalibration report, instantiates and initializes the requested covariates
+     * (required ones first, then optional ones), opens the recalibration table output stream, and
+     * finally builds the recalibration engine and caches the minimum quality and reference reader.
+     *
+     * @throws UserException.CommandLineException     if no known sites were given and RUN_WITHOUT_DBSNP is not set
+     * @throws UserException.CouldNotCreateOutputFile if the recalibration table file cannot be opened
+     */
+    public void initialize() {
+        baq = new BAQ(BAQGOP); // setup the BAQ object with the provided gap open penalty
+
+        if (RAC.FORCE_PLATFORM != null)
+            RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
+
+        // Fail fast: without a dbSNP file or other variant mask, known variation would be counted as error.
+        if (RAC.knownSites.isEmpty() && !RAC.RUN_WITHOUT_DBSNP)
+            throw new UserException.CommandLineException(NO_DBSNP_EXCEPTION);
+
+        if (RAC.LIST_ONLY) {
+            RecalUtils.listAvailableCovariates(logger);
+            System.exit(0);
+        }
+        RAC.existingRecalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table
+
+        // initialize the required and optional covariates
+        final Pair<ArrayList<Covariate>, ArrayList<Covariate>> covariates = RecalUtils.initializeCovariates(RAC);
+        final ArrayList<Covariate> requiredCovariates = covariates.getFirst();
+        final ArrayList<Covariate> optionalCovariates = covariates.getSecond();
+
+        // required covariates occupy the leading slots, optional covariates follow
+        requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()];
+        int covariateIndex = 0;
+        for (final Covariate covariate : requiredCovariates)
+            requestedCovariates[covariateIndex++] = covariate;
+        for (final Covariate covariate : optionalCovariates)
+            requestedCovariates[covariateIndex++] = covariate;
+
+        logger.info("The covariates being used here: ");
+        for (final Covariate cov : requestedCovariates) { // list all the covariates being used
+            logger.info("\t" + cov.getClass().getSimpleName());
+            cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection
+        }
+
+        try {
+            RAC.RECAL_TABLE = new PrintStream(RAC.RECAL_TABLE_FILE);
+        } catch (IOException e) {
+            throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_TABLE_FILE, e);
+        }
+
+        initializeRecalibrationEngine();
+        minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN;
+        referenceReader = getToolkit().getReferenceDataSource().getReference();
+    }
+
+    /**
+     * Creates the recalibration engine, handing it the requested covariates, the total number of
+     * read groups declared across all SAM file headers of the toolkit, the update log and the
+     * low-memory flag.
+     */
+    private void initializeRecalibrationEngine() {
+        int readGroupCount = 0;
+        for ( final SAMFileHeader samHeader : getToolkit().getSAMFileHeaders() ) {
+            final int groupsInHeader = samHeader.getReadGroups().size();
+            readGroupCount += groupsInHeader;
+        }
+
+        recalibrationEngine = new RecalibrationEngine(requestedCovariates, readGroupCount, RAC.RECAL_TABLE_UPDATE_LOG, lowMemoryMode);
+    }
+
+    /**
+     * Tells whether the base at the given offset is below the minimum quality to use.
+     *
+     * @param read   the read whose base qualities are inspected
+     * @param offset index of the base within the read
+     * @return true when the base quality at offset is strictly less than minimumQToUse
+     */
+    private boolean isLowQualityBase( final GATKSAMRecord read, final int offset ) {
+        final byte[] qualities = read.getBaseQualities();
+        return qualities[offset] < minimumQToUse;
+    }
+
+    /**
+     * Processes one read: clips it, derives its SNP / insertion / deletion event vectors, its BAQ
+     * array and its covariates, and feeds the combined information to the recalibration engine.
+     *
+     * @param ref             the reference context for the read
+     * @param originalRead    the read as delivered by the traversal, before any clipping
+     * @param metaDataTracker tracker used to look up the known sites overlapping the read
+     * @return 1 when the read contributed to the recalibration data; 0 when it was skipped because
+     *         nothing remained after clipping, its color space was inconsistent, or no BAQ array
+     *         could be produced for it
+     */
+    public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final RefMetaDataTracker metaDataTracker ) {
+
+        final GATKSAMRecord adaptorClipped = ReadClipper.hardClipAdaptorSequence(originalRead);
+        final GATKSAMRecord read = ReadClipper.hardClipSoftClippedBases(adaptorClipped);
+        if ( read.isEmpty() )
+            return 0L; // the whole read was inside the adaptor so skip it
+
+        RecalUtils.parsePlatformForRead(read, RAC);
+        // parse the solid color space and check for color no-calls
+        final boolean colorSpaceConsistent = RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read);
+        if ( !colorSpaceConsistent )
+            return 0L; // skip this read completely
+
+        final int[] isSNP = calculateIsSNP(read, ref, originalRead);
+        final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION);
+        final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION);
+        final int nErrors = nEvents(isSNP, isInsertion, isDeletion);
+
+        // For efficiency the real BAQ array is only computed when there is some error to marginalize
+        // over. For ILMN data ~85% of reads have no error and get the flat array instead.
+        final byte[] baqArray;
+        if ( nErrors == 0 )
+            baqArray = flatBAQArray(read);
+        else
+            baqArray = calculateBAQArray(read);
+
+        if ( baqArray == null )
+            return 0L; // some reads just can't be BAQ'ed
+
+        final ReadCovariates covariates = RecalUtils.computeCovariates(read, requestedCovariates);
+        // skip known sites of variation as well as low quality and non-regular bases
+        final boolean[] skip = calculateSkipArray(read, metaDataTracker);
+        final double[] snpErrors = calculateFractionalErrorArray(isSNP, baqArray);
+        final double[] insertionErrors = calculateFractionalErrorArray(isInsertion, baqArray);
+        final double[] deletionErrors = calculateFractionalErrorArray(isDeletion, baqArray);
+
+        // bundle everything into one info object and update the recalibration data with it
+        final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skip, snpErrors, insertionErrors, deletionErrors);
+        recalibrationEngine.updateDataForRead(info);
+        return 1L;
+    }
+
+ /**
+  * Count the mutational events recorded across all of the supplied hasEvent vectors.
+  *
+  * The count is simply the sum of every entry of every vector in hasEvents.
+  *
+  * @param hasEvents a vector of vectors of 0 (no event) and 1 (has event)
+  * @return the total number of events across all hasEvent arrays
+  */
+ protected static int nEvents(final int[]... hasEvents) {
+     int total = 0;
+     for ( int v = 0; v < hasEvents.length; v++ ) {
+         total += MathUtils.sum(hasEvents[v]);
+     }
+     return total;
+ }
+ /** Flag the bases to skip: non-regular bases, low quality bases, known sites and bad SOLiD offsets (checked in that order). */
+ protected boolean[] calculateSkipArray( final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) {
+     final byte[] readBases = read.getReadBases();
+     final boolean[] skip = new boolean[readBases.length];
+     final boolean[] atKnownSite = calculateKnownSites(read, metaDataTracker.getValues(RAC.knownSites));
+     for ( int offset = 0; offset < readBases.length; offset++ ) {
+         skip[offset] = !( BaseUtils.isRegularBase(readBases[offset]) && !isLowQualityBase(read, offset) && !atKnownSite[offset] && !badSolidOffset(read, offset) );
+     }
+     return skip;
+ }
+ /** True only when read is a SOLiD read, RAC.SOLID_RECAL_MODE is not DO_NOTHING, and the color space at offset is not consistent. */
+ protected boolean badSolidOffset( final GATKSAMRecord read, final int offset ) {
+     return !( !ReadUtils.isSOLiDRead(read) || RAC.SOLID_RECAL_MODE == RecalUtils.SOLID_RECAL_MODE.DO_NOTHING || RecalUtils.isColorSpaceConsistent(read, offset) );
+ }
+
+ /**
+  * Mark the bases of a read that are covered by any of the given features.
+  *
+  * @param read     the read onto which the feature coordinates are projected
+  * @param features the features (the known sites handed over by calculateSkipArray) to mark
+  * @return one flag per read base, true for every base that falls inside at least one feature
+  */
+ protected static boolean[] calculateKnownSites( final GATKSAMRecord read, final List<Feature> features ) {
+     final int readLength = read.getReadBases().length;
+     final boolean[] knownSites = new boolean[readLength]; // a new boolean[] is already all false, no explicit fill needed
+     for ( final Feature f : features ) {
+         // when the lookup reports CLIPPING_GOAL_NOT_REACHED for the feature start, fall back to the first read base
+         int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here?
+         if ( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureStartOnRead = 0; }
+
+         // when the lookup reports CLIPPING_GOAL_NOT_REACHED for the feature end, fall back to readLength
+         int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true);
+         if ( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureEndOnRead = readLength; }
+         if ( featureStartOnRead > readLength ) { featureStartOnRead = featureEndOnRead = readLength; } // start past the read: the range below becomes empty
+         Arrays.fill(knownSites, Math.max(0, featureStartOnRead), Math.min(readLength, featureEndOnRead + 1), true);
+     }
+     return knownSites;
+ }
+
+ /**
+  * Flag the read bases that mismatch the reference, walking the cigar of the (clipped) read.
+  * BUGBUG: can be merged with calculateIsIndel
+  * @return one entry per read base: 1 for a mismatching M/EQ/X base, 0 otherwise (I and S bases are never flagged)
+  */
+ protected static int[] calculateIsSNP( final GATKSAMRecord read, final ReferenceContext ref, final GATKSAMRecord originalRead ) {
+     final byte[] readBases = read.getReadBases();
+     // cut the reference bases down by the alignment start/end offsets between the clipped read and the original read
+     final int refFrom = read.getAlignmentStart() - originalRead.getAlignmentStart();
+     final int refTo = ref.getBases().length + read.getAlignmentEnd() - originalRead.getAlignmentEnd();
+     final byte[] refBases = Arrays.copyOfRange(ref.getBases(), refFrom, refTo);
+     final int[] mismatch = new int[readBases.length];
+     int readIdx = 0;
+     int refIdx = 0;
+     for ( final CigarElement element : read.getCigar().getCigarElements() ) {
+         final int length = element.getLength();
+         switch ( element.getOperator() ) {
+             case M: case EQ: case X: // read and reference advance together, one comparison per position
+                 for ( final int stop = readIdx + length; readIdx < stop; readIdx++, refIdx++ ) {
+                     mismatch[readIdx] = BaseUtils.basesAreEqual(readBases[readIdx], refBases[refIdx]) ? 0 : 1;
+                 }
+                 break;
+             case D: case N: // only the reference position advances
+                 refIdx += length;
+                 break;
+             case I: case S: // only the read position advances. ReferenceContext doesn't have the soft clipped bases!
+                 readIdx += length;
+                 break;
+             case H: case P: // neither position advances
+                 break;
+             default:
+                 throw new ReviewedStingException("Unsupported cigar operator: " + element.getOperator());
+         }
+     }
+     return mismatch;
+ }
+
+ /**
+  * Flag the read positions that carry an indel event of the requested type.
+  *
+  * The flagged base is the one next to the event: the base before it on forward strand reads and the base after
+  * it on negative strand reads. Events whose flagged base would fall outside the read (1D3M or 3M1D) are dropped.
+  *
+  * @param read the read whose cigar is scanned
+  * @param mode the event type to flag; deletions are only marked for BASE_DELETION, insertions for BASE_INSERTION
+  * @return one entry per read base: 1 at the flagged positions, 0 everywhere else
+  */
+ protected static int[] calculateIsIndel( final GATKSAMRecord read, final EventType mode ) {
+     final int[] flags = new int[read.getReadBases().length];
+     final boolean negativeStrand = read.getReadNegativeStrandFlag();
+     int readIdx = 0;
+     for ( final CigarElement element : read.getCigar().getCigarElements() ) {
+         final int length = element.getLength();
+         switch ( element.getOperator() ) {
+             case M: case EQ: case X: case S:
+                 readIdx += length;
+                 break;
+             case D:
+                 // a deletion does not advance readIdx, so readIdx is the first read base after the deletion
+                 updateIndel(flags, negativeStrand ? readIdx : readIdx - 1, mode, EventType.BASE_DELETION);
+                 break;
+             case I:
+                 // forward strand: flag the base before the insertion; negative strand: flag the base after it
+                 if ( negativeStrand ) {
+                     readIdx += length;
+                     updateIndel(flags, readIdx, mode, EventType.BASE_INSERTION);
+                 } else {
+                     updateIndel(flags, readIdx - 1, mode, EventType.BASE_INSERTION);
+                     readIdx += length;
+                 }
+                 break;
+             case N: case H: case P:
+                 break;
+             default:
+                 throw new ReviewedStingException("Unsupported cigar operator: " + element.getOperator());
+         }
+     }
+     return flags;
+ }
+
+ /** Set indel[index] to 1 when mode equals requiredMode and index lies inside the array; anything else is a no-op. */
+ private static void updateIndel(final int[] indel, final int index, final EventType mode, final EventType requiredMode) {
+     if ( mode != requiredMode ) { return; }
+     if ( 0 <= index && index < indel.length ) { indel[index] = 1; } // protect ourselves from events at the start or end of the read (1D3M or 3M1D)
+ }
+
+ /**
+  * Turn a per-base error vector into fractional errors, sharing errors across stretches of BAQ uncertainty.
+  *
+  * A block opens at the first base whose BAQ value is not NO_BAQ_UNCERTAINTY and is closed by the next base whose
+  * value is NO_BAQ_UNCERTAINTY, or by the end of the read. Bases outside of any block keep their own error value.
+  *
+  * @param errorArray per-base event indicators
+  * @param baqArray   per-base BAQ values; must have the same length as errorArray
+  * @return a new array, of the same length as the inputs, holding the fractional error of every base
+  */
+ protected static double[] calculateFractionalErrorArray( final int[] errorArray, final byte[] baqArray ) {
+     if ( errorArray.length != baqArray.length ) {
+         throw new ReviewedStingException("Array length mismatch detected. Malformed read?");
+     }
+
+     final int BLOCK_START_UNSET = -1;
+     final double[] fractionalErrors = new double[baqArray.length]; // Java initializes every entry to 0.0
+     int blockStartIndex = BLOCK_START_UNSET; // any other value means we are currently inside a block
+     for ( int iii = 0; iii < fractionalErrors.length; iii++ ) {
+         if ( baqArray[iii] != NO_BAQ_UNCERTAINTY ) {
+             if ( blockStartIndex == BLOCK_START_UNSET ) { blockStartIndex = iii; } // open a new block
+         } else if ( blockStartIndex == BLOCK_START_UNSET ) {
+             fractionalErrors[iii] = (double) errorArray[iii];
+         } else {
+             // this base closes the open block and is itself handed to the block computation
+             calculateAndStoreErrorsInBlock(iii, blockStartIndex, errorArray, fractionalErrors);
+             blockStartIndex = BLOCK_START_UNSET;
+         }
+     }
+     if ( blockStartIndex != BLOCK_START_UNSET ) { // the read ended while still inside a block
+         calculateAndStoreErrorsInBlock(fractionalErrors.length - 1, blockStartIndex, errorArray, fractionalErrors);
+     }
+     return fractionalErrors;
+ }
+
+ private static void calculateAndStoreErrorsInBlock( final int iii,
+ final int blockStartIndex,
+ final int[] errorArray,
+ final double[] fractionalErrors ) {
+ int totalErrors = 0;
+ for( int jjj = Math.max(0,blockStartIndex-1); jjj <= iii; jjj++ ) {
+ totalErrors += errorArray[jjj];
+ }
+ for( int jjj = Math.max(0, blockStartIndex-1); jjj <= iii; jjj++ ) {
+ fractionalErrors[jjj] = ((double) totalErrors) / ((double)(iii - Math.max(0,blockStartIndex-1) + 1));
+ }
+ }
+
+ /**
+  * Create a BAQ style array that indicates no alignment uncertainty at any base of the read
+  * @param read the read for which we want a BAQ array
+  * @return a non-null byte[] of length read.getReadLength() containing only NO_BAQ_UNCERTAINTY values
+  * // TODO -- could be optimized by avoiding this function entirely and handling this case inline in the calculation code above
+  */
+ protected static byte[] flatBAQArray(final GATKSAMRecord read) {
+     final byte[] noUncertainty = new byte[read.getReadLength()];
+     Arrays.fill(noUncertainty, NO_BAQ_UNCERTAINTY);
+     return noUncertainty;
+ }
+
+ /**
+ * Compute an actual BAQ array for read: run baq.baqRead against referenceReader (RECALCULATE, ADD_TAG), then read the tag back
+ * @param read the read to BAQ
+ * @return the BAQ tag array for read -- NOTE(review): map() null-checks this value ("some reads just can't be BAQ'ed"), so presumably it can be null
+ */
+ private byte[] calculateBAQArray( final GATKSAMRecord read ) {
+ baq.baqRead(read, referenceReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.ADD_TAG);
+ return BAQ.getBAQTag(read);
+ }
+
+ /**
+ * Initialize the reduce step by returning 0L (presumably the starting value of the sum that reduce() adds to)
+ *
+ * @return returns 0L
+ */
+ public Long reduceInit() {
+ return 0L;
+ }
+
+ /**
+  * Add the result of one map call to the running total.
+  *
+  * @param mapped Result of the map: 1L for a read handed to the recalibration engine, 0L for a skipped read
+  * @param sum The total accumulated so far
+  * @return returns the new total, sum + mapped
+  */
+ public Long reduce(Long mapped, Long sum) {
+     final long total = sum + mapped;
+     return total;
+ }
+ /** Finalize the recalibration data, quantize the quality scores, write the report, and log the count passed in as result. */
+ @Override
+ public void onTraversalDone(Long result) {
+ recalibrationEngine.finalizeData();
+ // NOTE(review): presumably finalizeData() has to come first -- both steps below read recalibrationEngine.getFinalRecalibrationTables()
+ logger.info("Calculating quantized quality scores...");
+ quantizeQualityScores();
+
+ logger.info("Writing recalibration report...");
+ generateReport();
+ logger.info("...done!");
+
+ logger.info("BaseRecalibrator was able to recalibrate " + result + " reads");
+ }
+ /** Shorthand for recalibrationEngine.getFinalRecalibrationTables(). */
+ private RecalibrationTables getRecalibrationTable() {
+ return recalibrationEngine.getFinalRecalibrationTables();
+ }
+
+ /**
+ * Build quantizationInfo from the final recalibration tables and RAC.QUANTIZING_LEVELS. Per the original note, this goes
+ * through the quality score table and uses the # observations and the empirical quality score to build a quality score
+ * histogram, then uses the QuantizeQual algorithm to generate a quantization map (recalibrated_qual -> quantized_qual)
+ */
+ private void quantizeQualityScores() {
+ quantizationInfo = new QuantizationInfo(getRecalibrationTable(), RAC.QUANTIZING_LEVELS);
+ }
+ /** Output the recalibration report through RecalUtils, from RAC, quantizationInfo, the final tables and requestedCovariates. */
+ private void generateReport() {
+ RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, getRecalibrationTable(), requestedCovariates, RAC.SORT_BY_ALL_COLUMNS);
+ }
+}
\ No newline at end of file
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/BaseCoverageDistribution.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/AbstractStratification.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/CallableStatus.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalMetric.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetric.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricCoverageGap.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricExcessiveCoverage.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricLowCoverage.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusMetricPoorQuality.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/LocusStratification.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/Metric.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/PluginUtils.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetric.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricBadMates.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricBadMates.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricBadMates.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricBadMates.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleMetricNoReads.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/SampleStratification.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java
new file mode 100644
index 000000000..7457acb22
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java
@@ -0,0 +1,292 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.clipping.ReadClipper;
+import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
+import org.broadinstitute.sting.utils.collections.Pair;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
+import org.broadinstitute.variant.variantcontext.*;
+
+import java.util.*;
+
+/**
+ * Code for determining which indels are segregating among the samples.
+ *
+ * This code is just a refactor of the original code from Guillermo in the UG.
+ *
+ * @author Mark DePristo
+ * @since 3/26/12
+ */
+public class ConsensusAlleleCounter {
+    protected static final Logger logger = Logger.getLogger(ConsensusAlleleCounter.class);
+    private final int minIndelCountForGenotyping; // minimum supporting-read count for a candidate indel
+    private final boolean doMultiAllelicCalls; // when false, only the best-supported candidate is kept
+    private final double minFractionInOneSample; // samples whose indel-read fraction is below this are skipped
+
+    public ConsensusAlleleCounter(final boolean doMultiAllelicCalls,
+                                  final int minIndelCountForGenotyping,
+                                  final double minFractionInOneSample) {
+        this.minIndelCountForGenotyping = minIndelCountForGenotyping;
+        this.doMultiAllelicCalls = doMultiAllelicCalls;
+        this.minFractionInOneSample = minFractionInOneSample;
+    }
+
+    /**
+     * Returns a list of Alleles at this locus that may be segregating
+     *
+     * @param ref reference context supplying the locus and the reference bases used for allele padding
+     * @param contexts alignment contexts to scan, one map entry per sample
+     * @param contextType read orientation used to stratify each sample's context before counting
+     * @return the alleles of the merged candidate indels, or an empty list when no candidate passes the count and base checks
+     */
+    public List<Allele> computeConsensusAlleles(ReferenceContext ref,
+                                                Map<String, AlignmentContext> contexts,
+                                                AlignmentContextUtils.ReadOrientation contextType) {
+        final Map<String, Integer> consensusIndelStrings = countConsensusAlleles(ref, contexts, contextType);
+        return consensusCountsToAlleles(ref, consensusIndelStrings);
+    }
+
+    // Tallies supporting reads per candidate indel: insertions are keyed by their inserted bases, deletions by "D" + length.
+    // TODO -- WARNING DOESN'T WORK WITH REDUCED READS
+    // Returns an empty map when neither the insertion total nor the deletion total reaches minIndelCountForGenotyping.
+    private Map<String, Integer> countConsensusAlleles(ReferenceContext ref,
+                                                       Map<String, AlignmentContext> contexts,
+                                                       AlignmentContextUtils.ReadOrientation contextType) {
+        final GenomeLoc loc = ref.getLocus();
+        final Map<String, Integer> consensusIndelStrings = new HashMap<String, Integer>();
+
+        int insCount = 0, delCount = 0;
+        // quick check of total number of indels in pileup
+        for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
+            final AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
+
+            final ReadBackedPileup indelPileup = context.getBasePileup();
+            insCount += indelPileup.getNumberOfInsertionsAfterThisElement();
+            delCount += indelPileup.getNumberOfDeletionsAfterThisElement();
+        }
+
+        if ( insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping )
+            return Collections.emptyMap();
+
+        for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
+            // todo -- warning, can be duplicating expensive partition here
+            final AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
+
+            final ReadBackedPileup indelPileup = context.getBasePileup();
+
+            final int nIndelReads = indelPileup.getNumberOfInsertionsAfterThisElement() + indelPileup.getNumberOfDeletionsAfterThisElement();
+            final int nReadsOverall = indelPileup.getNumberOfElements();
+
+            if ( nIndelReads == 0 || (nIndelReads / (1.0 * nReadsOverall)) < minFractionInOneSample ) {
+                continue;
+            }
+
+            for (PileupElement p : indelPileup) {
+                final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
+                if (read == null)
+                    continue;
+                if (ReadUtils.is454Read(read)) {
+                    continue;
+                }
+
+                if ( p.isBeforeInsertion() ) {
+                    final String insertionBases = p.getBasesOfImmediatelyFollowingInsertion();
+                    // edge case: ignore a deletion immediately preceding an insertion as p.getBasesOfImmediatelyFollowingInsertion() returns null [EB]
+                    if ( insertionBases == null )
+                        continue;
+
+                    boolean foundKey = false;
+                    // copy of hashmap into temp arrayList
+                    final List<Pair<String,Integer>> cList = new ArrayList<Pair<String,Integer>>();
+                    for (Map.Entry<String, Integer> s : consensusIndelStrings.entrySet()) {
+                        cList.add(new Pair<String,Integer>(s.getKey(), s.getValue()));
+                    }
+
+                    if (read.getAlignmentEnd() == loc.getStart()) {
+                        // first corner condition: a read has an insertion at the end, and we're right at the insertion.
+                        // In this case, the read could have any of the inserted bases and we need to build a consensus
+
+                        for (int k=0; k < cList.size(); k++) {
+                            final String s = cList.get(k).getFirst();
+                            final int cnt = cList.get(k).getSecond();
+                            // case 1: current insertion is prefix of indel in hash map
+                            if (s.startsWith(insertionBases)) {
+                                cList.set(k,new Pair<String,Integer>(s,cnt+1));
+                                foundKey = true;
+                            }
+                            else if (insertionBases.startsWith(s)) {
+                                // case 2: indel stored in hash table is prefix of current insertion
+                                // In this case, new bases are new key.
+                                foundKey = true;
+                                cList.set(k,new Pair<String,Integer>(insertionBases,cnt+1));
+                            }
+                        }
+                        if (!foundKey)
+                            // none of the above: event bases not supported by previous table, so add new key
+                            cList.add(new Pair<String,Integer>(insertionBases,1));
+
+                    }
+                    else if (read.getAlignmentStart() == loc.getStart()+1) {
+                        // opposite corner condition: read will start at current locus with an insertion
+                        for (int k=0; k < cList.size(); k++) {
+                            final String s = cList.get(k).getFirst();
+                            final int cnt = cList.get(k).getSecond();
+                            if (s.endsWith(insertionBases)) {
+                                // case 1: current insertion (insertionBases) is suffix of indel in hash map (s)
+                                cList.set(k,new Pair<String,Integer>(s,cnt+1));
+                                foundKey = true;
+                            }
+                            else if (insertionBases.endsWith(s)) {
+                                // case 2: indel stored in hash table is suffix of current insertion
+                                // In this case, new bases are new key.
+                                foundKey = true;
+                                cList.set(k,new Pair<String,Integer>(insertionBases,cnt+1));
+                            }
+                        }
+                        if (!foundKey)
+                            // none of the above: event bases not supported by previous table, so add new key
+                            cList.add(new Pair<String,Integer>(insertionBases,1));
+
+
+                    }
+                    else {
+                        // normal case: insertion somewhere in the middle of a read: add count to arrayList
+                        final int cnt = consensusIndelStrings.containsKey(insertionBases)? consensusIndelStrings.get(insertionBases):0;
+                        cList.add(new Pair<String,Integer>(insertionBases,cnt+1));
+                    }
+
+                    // copy back arrayList into hashMap (for a repeated key the later list entry wins)
+                    consensusIndelStrings.clear();
+                    for (Pair<String,Integer> pair : cList) {
+                        consensusIndelStrings.put(pair.getFirst(),pair.getSecond());
+                    }
+
+                }
+                else if ( p.isBeforeDeletionStart() ) {
+                    final String deletionString = String.format("D%d",p.getLengthOfImmediatelyFollowingIndel());
+                    final int cnt = consensusIndelStrings.containsKey(deletionString)? consensusIndelStrings.get(deletionString):0;
+                    consensusIndelStrings.put(deletionString,cnt+1);
+                }
+            }
+        }
+
+        return consensusIndelStrings;
+    }
+
+    /** Turns the indel-string counts into reference-padded ref/alt allele pairs and merges the surviving candidates into one allele list. */
+    private List<Allele> consensusCountsToAlleles(final ReferenceContext ref,
+                                                  final Map<String, Integer> consensusIndelStrings) {
+        final GenomeLoc loc = ref.getLocus();
+        final Collection<VariantContext> vcs = new ArrayList<VariantContext>();
+        int maxAlleleCnt = 0;
+        Allele refAllele, altAllele;
+
+        for (final Map.Entry<String, Integer> elt : consensusIndelStrings.entrySet()) {
+            final String s = elt.getKey();
+            final int curCnt = elt.getValue();
+            int stop = 0;
+
+            // only alleles whose observed count reaches the minimum threshold are genotyped
+            if (curCnt < minIndelCountForGenotyping)
+                continue;
+
+            if (s.startsWith("D")) {
+                // get deletion length (the key is "D" followed by the length)
+                final int dLen = Integer.parseInt(s.substring(1));
+                // get ref bases of accurate deletion
+                final int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart();
+                stop = loc.getStart() + dLen;
+                final byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference - 1, startIdxInReference + dLen); // add reference padding
+
+                if (Allele.acceptableAlleleBases(refBases, false)) {
+                    refAllele = Allele.create(refBases, true);
+                    altAllele = Allele.create(ref.getBase(), false);
+                }
+                else continue; // don't go on with this allele if refBases are non-standard
+            } else {
+                // insertion case
+                final String insertionBases = (char)ref.getBase() + s; // add reference padding
+                if (Allele.acceptableAlleleBases(insertionBases, false)) { // don't allow N's in insertions
+                    refAllele = Allele.create(ref.getBase(), true);
+                    altAllele = Allele.create(insertionBases, false);
+                    stop = loc.getStart();
+                }
+                else continue; // go on to next allele if consensus insertion has any non-standard base.
+            }
+
+            final VariantContextBuilder builder = new VariantContextBuilder().source("");
+            builder.loc(loc.getContig(), loc.getStart(), stop);
+            builder.alleles(Arrays.asList(refAllele, altAllele));
+            builder.noGenotypes();
+            if (doMultiAllelicCalls) {
+                vcs.add(builder.make());
+                if (vcs.size() >= GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED)
+                    break;
+            } else if (curCnt > maxAlleleCnt) {
+                maxAlleleCnt = curCnt;
+                vcs.clear();
+                vcs.add(builder.make());
+            }
+        }
+
+        if (vcs.isEmpty())
+            return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion
+
+        final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false);
+        return mergedVC.getAlleles();
+    }
+}
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java
new file mode 100644
index 000000000..77c51f88b
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java
@@ -0,0 +1,500 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import net.sf.samtools.SAMUtils;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.QualityUtils;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.fragments.FragmentCollection;
+import org.broadinstitute.sting.utils.fragments.FragmentUtils;
+import org.broadinstitute.sting.utils.genotyper.DiploidGenotype;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+
+import java.util.List;
+
+import static java.lang.Math.log10;
+import static java.lang.Math.pow;
+
+/**
+ * Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors,
+ * and posteriors given a pile of bases and quality scores
+ *
+ * Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object
+ * calculates:
+ *
+ * P(G | D) = P(G) * P(D | G)
+ *
+ * where
+ *
+ * P(D | G) = sum_i log10 P(bi | G)
+ *
+ * and
+ *
+ * P(bi | G) = 1 - P(error | qi) if bi is in G
+ * = P(error | qi) / 3 if bi is not in G
+ *
+ * for homozygous genotypes and for heterozygous genotypes:
+ *
+ * P(bi | G) = (1 - P(error | qi)) / 2 + P(error | qi) / 6 if bi is in G
+ * = P(error | qi) / 3 if bi is not in G
+ *
+ * for each of the 10 unique diploid genotypes AA, AC, AG, .., TT
+ *
+ * Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space.
+ *
+ * The priors contain the relative probabilities of each genotype, and must be provided at object creation.
+ * From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above
+ * model.
+ */
+public class DiploidSNPGenotypeLikelihoods implements Cloneable {
+
+ public final static double DEFAULT_PCR_ERROR_RATE = FragmentUtils.DEFAULT_PCR_ERROR_RATE; // re-exported from FragmentUtils
+
+ protected final static int FIXED_PLOIDY = 2;
+ protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1;
+ protected final static double ploidyAdjustment = log10(FIXED_PLOIDY); // log10 of the fixed ploidy
+ protected final static double log10_3 = log10(3.0);
+
+ protected boolean VERBOSE = false;
+
+ //
+ // The fundamental data arrays associated with a Genotype Likelihoods object
+ //
+ protected double[] log10Likelihoods = null; // assigned by setToZero(), which the constructor calls
+
+ // TODO: don't calculate this each time through
+ protected double log10_PCR_error_3; // log10(PCR error rate) - log10(3), set in the constructor
+ protected double log10_1_minus_PCR_error; // log10(1 - PCR error rate), set in the constructor
+
+ /**
+  * Creates a genotype likelihoods object with zeroed likelihoods and log10 terms derived from the PCR error rate.
+  *
+  * @param PCR_error_rate the PCR error rate; log10(rate / 3) and log10(1 - rate) are stored on the instance
+  */
+ public DiploidSNPGenotypeLikelihoods(double PCR_error_rate) {
+     this.log10_PCR_error_3 = Math.log10(PCR_error_rate) - log10_3;
+     this.log10_1_minus_PCR_error = Math.log10(1.0 - PCR_error_rate);
+     setToZero();
+ }
+
+ /**
+ * Cloning of the object
+ * @return clone
+ * @throws CloneNotSupportedException
+ */
+ protected Object clone() throws CloneNotSupportedException {
+ DiploidSNPGenotypeLikelihoods c = (DiploidSNPGenotypeLikelihoods)super.clone();
+ c.log10Likelihoods = log10Likelihoods.clone();
+ return c;
+ }
+
+ protected void setToZero() {
+     this.log10Likelihoods = genotypeZeros.clone(); // start from a private copy of genotypeZeros (likelihoods are all zeros)
+ }
+
+ /**
+  * Exposes the log10 likelihoods, one slot per genotype, indexed by DiploidGenotype.ordinal() values.
+  * @return the internal likelihoods array itself (not a copy)
+  */
+ public double[] getLikelihoods() {
+     return this.log10Likelihoods;
+ }
+
+ // -------------------------------------------------------------------------------------
+ //
+ // add() routines. These are the workhorse routines for calculating the overall genotype
+ // likelihoods given observed bases and reads. Includes high-level operators all the
+ // way down to single base and qual functions.
+ //
+ // -------------------------------------------------------------------------------------
+
+ /**
+  * Updates the likelihoods with the observations in the read-based pileup. The pileup is first
+  * grouped into fragments: singleton reads are added one element at a time, overlapping pairs are
+  * added jointly through the list-based add() overload.
+  *
+  * @param pileup read pileup
+  * @param ignoreBadBases should we ignore bad bases?
+  * @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality?
+  * @param minBaseQual the minimum base quality at which to consider a base valid
+  * @return the sum of the per-fragment add() results, i.e. the number of usable observations found
+  */
+ public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
+     int n = 0;
+
+     // for each fragment, add to the likelihoods
+     final FragmentCollection<PileupElement> fpile = pileup.toFragments();
+
+     for ( final PileupElement p : fpile.getSingletonReads() )
+         n += add(p, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
+
+     for ( final List<PileupElement> overlappingPair : fpile.getOverlappingPairs() )
+         n += add(overlappingPair, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
+
+     return n;
+ }
+
+ public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
+     // Single-element fragment: a usable quality of 0 means skip; otherwise add with no second base, nObs = 1.
+     final byte observed = elt.getBase();
+     final byte usableQual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
+     if ( usableQual == 0 )
+         return 0;
+     return add(observed, usableQual, (byte)0, (byte)0, 1);
+ }
+
+ public int add(List overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
+ final PileupElement p1 = overlappingPair.get(0);
+ final PileupElement p2 = overlappingPair.get(1);
+
+ final byte observedBase1 = p1.getBase();
+ final byte qualityScore1 = qualToUse(p1, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
+ final byte observedBase2 = p2.getBase();
+ final byte qualityScore2 = qualToUse(p2, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
+
+ if ( qualityScore1 == 0 ) {
+ if ( qualityScore2 == 0 ) // abort early if we didn't see any good bases
+ return 0;
+ else {
+ return add(observedBase2, qualityScore2, (byte)0, (byte)0);
+ }
+ } else {
+ return add(observedBase1, qualityScore1, observedBase2, qualityScore2);
+ }
+ }
+
+ /**
+ *
+ * @param obsBase1 first observed base
+ * @param qual1 base qual of first observed base
+ * @param obsBase2 second observed base
+ * @param qual2 base qual of second observed base; can be 0, indicating no second base was observed for this fragment
+ * @param nObs the number of times this quad of values was seen. Generally 1, but reduced reads can have nObs > 1 for synthetic reads
+ * @return 0 if the base is bad, 1 otherwise
+ */
+ private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2, int nObs) {
+ // TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine
+ // TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future.
+ // TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here.
+
+ // Just look up the cached result if it's available, or compute and store it
+ DiploidSNPGenotypeLikelihoods gl;
+ if ( ! inCache(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY) ) {
+ gl = calculateCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY);
+ } else {
+ gl = getCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY);
+ }
+
+ // for bad bases, there are no likelihoods
+ if ( gl == null )
+ return 0;
+
+ double[] likelihoods = gl.getLikelihoods();
+
+ for ( DiploidGenotype g : DiploidGenotype.values() ) {
+ double likelihood = likelihoods[g.ordinal()];
+ log10Likelihoods[g.ordinal()] += likelihood * nObs;
+ }
+
+ return 1;
+ }
+
+ private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) {
+     return this.add(obsBase1, qual1, obsBase2, qual2, 1); // convenience overload: the fragment is counted once
+ }
+
+ // -------------------------------------------------------------------------------------
+ // Dealing with the cache routines. CACHE is indexed [base1][qual1][base2][qual2][ploidy];
+ // base2 has one extra slot (index BaseUtils.BASES.length) that is used when qual2 == 0.
+ // NOTE(review): static, shared by all instances and written without locking -- confirm callers are single-threaded.
+ // -------------------------------------------------------------------------------------
+
+ static DiploidSNPGenotypeLikelihoods[][][][][] CACHE = new DiploidSNPGenotypeLikelihoods[BaseUtils.BASES.length][QualityUtils.MAX_SAM_QUAL_SCORE +1][BaseUtils.BASES.length+1][QualityUtils.MAX_SAM_QUAL_SCORE +1][MAX_PLOIDY];
+
+ protected boolean inCache(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
+     return null != getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy); // cached iff the slot is non-null
+ }
+
+ protected DiploidSNPGenotypeLikelihoods getCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
+ DiploidSNPGenotypeLikelihoods gl = getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy);
+ if ( gl == null )
+ throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base1=%c, qual1=%d, base2=%c, qual2=%d, ploidy=%d",
+ observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy));
+ return gl;
+ }
+
+ protected DiploidSNPGenotypeLikelihoods calculateCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
+ DiploidSNPGenotypeLikelihoods gl = calculateGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2);
+ setCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy, gl);
+ return gl;
+ }
+
+ protected void setCache( DiploidSNPGenotypeLikelihoods[][][][][] cache,
+ byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy,
+ DiploidSNPGenotypeLikelihoods val ) {
+ int i = BaseUtils.simpleBaseToBaseIndex(observedBase1);
+ int j = qualityScore1;
+ int k = qualityScore2 != 0 ? BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length;
+ int l = qualityScore2;
+ int m = ploidy;
+
+ cache[i][j][k][l][m] = val;
+ }
+
+ protected DiploidSNPGenotypeLikelihoods getCache(DiploidSNPGenotypeLikelihoods[][][][][] cache,
+ byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
+ int i = BaseUtils.simpleBaseToBaseIndex(observedBase1);
+ int j = qualityScore1;
+ int k = qualityScore2 != 0 ? BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length;
+ int l = qualityScore2;
+ int m = ploidy;
+ return cache[i][j][k][l][m];
+ }
+
+ /**
+  * Computes the per-genotype log10 likelihoods of a single fragment observation. The result is a
+  * clone of this object on which setToZero() is called before the observation is added.
+  *
+  * @return the freshly built likelihoods object for this observation
+  */
+ protected DiploidSNPGenotypeLikelihoods calculateGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) {
+     final double[] perBaseLog10 = computeLog10Likelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2);
+
+     try {
+         final DiploidSNPGenotypeLikelihoods fresh = (DiploidSNPGenotypeLikelihoods)this.clone();
+         fresh.setToZero();
+
+         // we need to adjust for ploidy. We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space
+         // todo assumes ploidy is 2 -- should be generalized. Obviously the below code can be turned into a loop
+         for ( DiploidGenotype genotype : DiploidGenotype.values() ) {
+             final double pFromBase1 = pow(10, perBaseLog10[BaseUtils.simpleBaseToBaseIndex(genotype.base1)] - ploidyAdjustment);
+             final double pFromBase2 = pow(10, perBaseLog10[BaseUtils.simpleBaseToBaseIndex(genotype.base2)] - ploidyAdjustment);
+             fresh.log10Likelihoods[genotype.ordinal()] += log10(pFromBase1 + pFromBase2);
+         }
+
+         if ( VERBOSE ) {
+             for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); }
+             System.out.println();
+             for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", fresh.log10Likelihoods[g.ordinal()]); }
+             System.out.println();
+         }
+
+         return fresh;
+     } catch ( CloneNotSupportedException e ) {
+         throw new RuntimeException(e);
+     }
+ }
+
+ /**
+ * Updates likelihoods and posteriors to reflect an additional observation of observedBase with
+ * qualityScore.
+ *
+ * @param observedBase1 the base observed on the 1st read of the fragment
+ * @param qualityScore1 the qual of the base on the 1st read of the fragment, or zero if NA
+ * @param observedBase2 the base observed on the 2nd read of the fragment
+ * @param qualityScore2 the qual of the base on the 2nd read of the fragment, or zero if NA
+ * @return likelihoods for this observation or null if the base was not considered good enough to add to the likelihoods (Q0 or 'N', for example)
+ */
+ protected double[] computeLog10Likelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) {
+ double[] log10FourBaseLikelihoods = baseZeros.clone();
+
+ for ( byte trueBase : BaseUtils.BASES ) {
+ double likelihood = 0.0;
+
+ for ( byte fragmentBase : BaseUtils.BASES ) {
+ double log10FragmentLikelihood = (trueBase == fragmentBase ? log10_1_minus_PCR_error : log10_PCR_error_3);
+ if ( qualityScore1 != 0 ) {
+ log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase1, fragmentBase, qualityScore1);
+ }
+ if ( qualityScore2 != 0 ) {
+ log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase2, fragmentBase, qualityScore2);
+ }
+
+ //if ( VERBOSE ) {
+ // System.out.printf(" L(%c | b=%s, Q=%d) = %f / %f%n",
+ // observedBase, trueBase, qualityScore, pow(10,likelihood) * 100, likelihood);
+ //}
+
+ likelihood += pow(10, log10FragmentLikelihood);
+ }
+
+ log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(trueBase)] = log10(likelihood);
+ }
+
+ return log10FourBaseLikelihoods;
+ }
+
+ /**
+ *
+ * @param observedBase observed base
+ * @param chromBase target base
+ * @param qual base quality
+ * @return log10 likelihood
+ */
+ protected double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual) {
+
+ double logP;
+
+ if ( observedBase == chromBase ) {
+ // the base is consistent with the chromosome -- it's 1 - e
+ //logP = oneMinusData[qual];
+ double e = pow(10, (qual / -10.0));
+ logP = log10(1.0 - e);
+ } else {
+ // the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error)
+ logP = qual / -10.0 + (-log10_3);
+ }
+
+ //System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP);
+ return logP;
+ }
+
+ /**
+ * Helper function that returns the phred-scaled base quality score we should use for calculating
+ * likelihoods for a pileup element. May return 0 to indicate that the observation is bad, and may
+ * cap the quality score by the mapping quality of the read itself.
+ *
+ * @param p Pileup element
+ * @param ignoreBadBases Should we ignore bad bases?
+ * @param capBaseQualsAtMappingQual Should we cap the base qualities at the mapping quality of the read?
+ * @param minBaseQual Minimum allowed base quality
+ * @return the actual base quality to use
+ */
+ private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
+ if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) )
+ return 0;
+
+ byte qual = p.getQual();
+
+ if ( qual > SAMUtils.MAX_PHRED_SCORE )
+ throw new UserException.MisencodedBAM(p.getRead(), "we encountered an extremely high quality score (" + (int)qual + ")");
+ if ( capBaseQualsAtMappingQual )
+ qual = (byte) Math.min( 0xff & qual, p.getMappingQual());
+ if ( (int)qual < minBaseQual )
+ qual = (byte)0;
+
+ return qual;
+ }
+
+ // -----------------------------------------------------------------------------------------------------------------
+ //
+ //
+ // helper routines
+ //
+ //
+ // -----------------------------------------------------------------------------------------------------------------
+
+ /**
+  * Return a string representation of this object in a moderately usable form: every genotype with its
+  * log10 likelihood, followed by the sum of the likelihoods converted back to linear space.
+  * @return string representation
+  */
+ @Override
+ public String toString() {
+     double linearSum = 0;
+     final StringBuilder text = new StringBuilder();
+     for (DiploidGenotype g : DiploidGenotype.values()) {
+         final double log10Value = log10Likelihoods[g.ordinal()];
+         text.append(String.format("%s %.10f ", g, log10Value));
+         linearSum += Math.pow(10, log10Value);
+     }
+     return text.append(String.format(" %f", linearSum)).toString(); }
+
+ // -----------------------------------------------------------------------------------------------------------------
+ //
+ //
+ // Validation routines
+ //
+ //
+ // -----------------------------------------------------------------------------------------------------------------
+
+ public boolean validate() {
+     return this.validate(true); // strict form: a failed check is raised as an exception
+ }
+
+ public boolean validate(boolean throwException) {
+ try {
+ for ( DiploidGenotype g : DiploidGenotype.values() ) {
+ String bad = null;
+
+ int i = g.ordinal();
+ if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) {
+ bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]);
+ }
+
+ if ( bad != null ) {
+ throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
+ }
+ }
+ } catch ( IllegalStateException e ) {
+ if ( throwException )
+ throw new RuntimeException(e);
+ else
+ return false;
+ }
+
+ return true;
+ }
+
+ //
+ // Constant static data: all-zero template arrays (baseZeros is cloned by computeLog10Likelihoods)
+ // The initializer below writes 0.0 explicitly; new double[] arrays already start out zero-filled.
+ private final static double[] genotypeZeros = new double[DiploidGenotype.values().length];
+ private final static double[] baseZeros = new double[BaseUtils.BASES.length];
+
+ static {
+ for ( DiploidGenotype g : DiploidGenotype.values() ) {
+ genotypeZeros[g.ordinal()] = 0.0;
+ }
+ for ( byte base : BaseUtils.BASES ) {
+ baseZeros[BaseUtils.simpleBaseToBaseIndex(base)] = 0.0;
+ }
+ }
+}
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java
new file mode 100644
index 000000000..a57502bc0
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java
@@ -0,0 +1,342 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import com.google.java.contract.Requires;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
+import org.broadinstitute.sting.utils.haplotype.Haplotype;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.QualityUtils;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.variant.variantcontext.Allele;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: carneiro
+ * Date: 7/21/11
+ * Time: 2:21 PM
+ *
+ * This is a site based implementation of an Error Model. The error model is a probability
+ * distribution for the site given the phred scaled quality.
+ */
+public class ErrorModel {
+ private byte maxQualityScore; // highest site quality modelled; the model array has maxQualityScore+1 entries
+ private byte minQualityScore; // lowest site quality that receives a value in the main constructor
+ private byte phredScaledPrior; // copied from the argument collection; not read by any live code in this class
+ private double log10minPower; // log10 of UAC.minPower; only referenced by the commented-out hasPowerForMaxAC
+ private int refDepth; // number of reference-sample pileup elements counted (0 when there is no usable data)
+ private boolean hasData = false; // set to true when the model is built from real observations or a supplied vector
+ private ProbabilityVector probabilityVector; // the per-quality log10 probabilities
+ private static final boolean compressRange = false; // second argument handed to the ProbabilityVector constructor
+
+ private static final double log10MinusE = Math.log10(Math.exp(1.0)); // NOTE: despite the name this is +log10(e); it is subtracted where used
+ private static final boolean DEBUG = false; // guards the System.out diagnostics
+ /**
+  * Calculates the probability of the data (reference sample reads) given the phred scaled site quality score.
+  *
+  * @param UAC             Argument Collection
+  * @param refSamplePileup Reference sample pileup
+  * @param refSampleVC     VC with True alleles in reference sample pileup
+  * @param refContext      Reference context of the site (supplies the reference base and the locus)
+  */
+ public ErrorModel (final UnifiedArgumentCollection UAC,
+                    final ReadBackedPileup refSamplePileup,
+                    VariantContext refSampleVC, final ReferenceContext refContext) {
+     this.maxQualityScore = UAC.maxQualityScore;
+     this.minQualityScore = UAC.minQualityScore;
+     this.phredScaledPrior = UAC.phredScaledPrior;
+     log10minPower = Math.log10(UAC.minPower);
+
+     PairHMMIndelErrorModel pairModel = null;
+     LinkedHashMap<Allele, Haplotype> haplotypeMap = null;
+     double[][] perReadLikelihoods = null;
+
+     double[] model = new double[maxQualityScore+1];
+     Arrays.fill(model,Double.NEGATIVE_INFINITY);
+
+     boolean hasCalledAlleles = false;
+
+     final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
+     if (refSampleVC != null) {
+
+         for (Allele allele : refSampleVC.getAlleles()) {
+             if (allele.isCalled()) {
+                 hasCalledAlleles = true;
+                 break;
+             }
+         }
+         haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
+         if (refSampleVC.isIndel()) {
+             pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
+                     UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
+             IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements
+         }
+     }
+
+     double p = QualityUtils.qualToErrorProbLog10((byte)(maxQualityScore-minQualityScore));
+     if (refSamplePileup == null || refSampleVC == null || !hasCalledAlleles) {
+         for (int q=minQualityScore; q<=maxQualityScore; q++) { // int index: a byte counter wraps to -128 when maxQualityScore is Byte.MAX_VALUE
+             // maximum uncertainty if there's no ref data at site
+             model[q] = p;
+         }
+         this.refDepth = 0;
+     }
+     else {
+         hasData = true;
+         int matches = 0;
+         int coverage = 0;
+
+         Allele refAllele = refSampleVC.getReference();
+
+         if ( refSampleVC.isIndel()) {
+             final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles());
+             if (!haplotypeMap.isEmpty())
+                 perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, perReadAlleleLikelihoodMap);
+         }
+         int idx = 0;
+         for (PileupElement refPileupElement : refSamplePileup) {
+             if (DEBUG)
+                 System.out.println(refPileupElement.toString());
+             boolean isMatch = false;
+             for (Allele allele : refSampleVC.getAlleles()) {
+                 boolean m = pileupElementMatches(refPileupElement, allele, refAllele, refContext.getBase());
+                 if (DEBUG) System.out.println(m);
+                 isMatch |= m;
+             }
+             if (refSampleVC.isIndel() && !haplotypeMap.isEmpty()) {
+                 // ignore match/mismatch if reads, as determined by their likelihood, are not informative
+                 double[] perAlleleLikelihoods = perReadLikelihoods[idx++];
+                 if (!isInformativeElement(perAlleleLikelihoods))
+                     matches++;
+                 else
+                     matches += (isMatch?1:0);
+
+             } else {
+                 matches += (isMatch?1:0);
+             }
+             coverage++;
+         }
+
+         int mismatches = coverage - matches;
+         // int loop index for the same reason as above; narrowed back to byte only for the helper call
+         for (int q=minQualityScore; q<=maxQualityScore; q++) {
+             if (coverage==0)
+                 model[q] = p;
+             else
+                 model[q] = log10PoissonProbabilitySiteGivenQual((byte) q,coverage, mismatches);
+         }
+         this.refDepth = coverage;
+     }
+
+     // compress probability vector
+     this.probabilityVector = new ProbabilityVector(model, compressRange);
+ }
+
+
+ /** Tells whether a read's per-allele likelihoods are spread far enough apart to favour some allele. */
+ @Requires("likelihoods.length>0")
+ private boolean isInformativeElement(double[] likelihoods) {
+     // if likelihoods are the same, they're not informative
+     final double minSpread = 0.1;
+     final double highest = likelihoods[MathUtils.maxElementIndex(likelihoods)];
+     final double lowest = likelihoods[MathUtils.minElementIndex(likelihoods)];
+
+     // written as a negated '<' so that a NaN spread is still reported as informative
+     return !(highest - lowest < minSpread);
+ }
+ /**
+  * Simple constructor that just takes a given log-probability vector as error model.
+  * Only intended for unit testing, not general usage.
+  *
+  * @param pvector Given vector of log-probabilities; the quality range is set to 0 .. pvector.length-1
+  */
+ public ErrorModel(double[] pvector) {
+     this.hasData = true;
+     this.minQualityScore = 0;
+     // NOTE(review): the byte cast would wrap for vectors longer than 128 entries
+     this.maxQualityScore = (byte)(pvector.length-1);
+     this.probabilityVector = new ProbabilityVector(pvector, compressRange);
+ }
+
+ public static boolean pileupElementMatches(PileupElement pileupElement, Allele allele, Allele refAllele, byte refBase) {
+ if (DEBUG)
+ System.out.format("PE: base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d Allele:%s RefAllele:%s\n",
+ pileupElement.getBase(), pileupElement.isBeforeDeletionStart(),
+ pileupElement.isBeforeInsertion(),pileupElement.getBasesOfImmediatelyFollowingInsertion(),pileupElement.getLengthOfImmediatelyFollowingIndel(), allele.toString(), refAllele.toString());
+
+ //pileupElement.
+ // if test allele is ref, any base mismatch, or any insertion/deletion at start of pileup count as mismatch
+ if (allele.isReference()) {
+ // for a ref allele, any base mismatch or new indel is a mismatch.
+ if(allele.getBases().length>0)
+ // todo - can't check vs. allele because allele is not padded so it doesn't include the reference base at this location
+ // could clean up/simplify this when unpadding is removed
+ return (pileupElement.getBase() == refBase && !pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart());
+ else
+ // either null allele to compare, or ref/alt lengths are different (indel by definition).
+ // if we have an indel that we are comparing against a REF allele, any indel presence (of any length/content) is a mismatch
+ return (!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart());
+ }
+
+ // for non-ref alleles to compare:
+ if (refAllele.getBases().length == allele.getBases().length)
+ // alleles have the same length (eg snp or mnp)
+ return pileupElement.getBase() == allele.getBases()[0];
+
+ // for non-ref alleles,
+ byte[] alleleBases = allele.getBases();
+ int eventLength = alleleBases.length - refAllele.getBases().length;
+ if (eventLength < 0 && pileupElement.isBeforeDeletionStart() && pileupElement.getLengthOfImmediatelyFollowingIndel() == -eventLength)
+ return true;
+
+ if (eventLength > 0 && pileupElement.isBeforeInsertion() &&
+ Arrays.equals(pileupElement.getBasesOfImmediatelyFollowingInsertion().getBytes(),Arrays.copyOfRange(alleleBases,1,alleleBases.length))) // allele contains ref byte, but pileupElement's event bases doesn't
+ return true;
+
+ return false;
+ }
+
+
+ /**
+ * What's the log-likelihood that a site's quality is equal to q? If we see N observations and n mismatches,
+ * and assuming each match is independent of each other and that the match probability is just dependent of
+ * the site quality, so p = 10.^-q/10.
+ * Since we'll normally have relatively high Q sites and deep coverage in reference samples (ie p small, N high),
+ * to avoid underflows we'll use the Poisson approximation with lambda = N*p.
+ * Hence, the log-likelihood of q i.e. Pr(Nmismatches = n | SiteQ = q) ~ Poisson(n | lambda = p*N) with p as above.
+ * @param q Desired q to get likelihood from
+ * @param coverage Total coverage
+ * @param mismatches Number of mismatches
+ * @return Likelihood of observations as a function of q
+ */
+ @Requires({
+ "q >= minQualityScore",
+ "q <= maxQualityScore",
+ "coverage >= 0",
+ "mismatches >= 0",
+ "mismatches <= coverage"
+ })
+ private double log10PoissonProbabilitySiteGivenQual(byte q, int coverage, int mismatches) {
+ // same as log10ProbabilitySiteGivenQual but with Poisson approximation to avoid numerical underflows
+ double lambda = QualityUtils.qualToErrorProb(q) * (double )coverage;
+ // log10(e^-lambda*lambda^k/k!) = -lambda + k*log10(lambda) - log10factorial(k)
+ return Math.log10(lambda)*mismatches - lambda*log10MinusE- MathUtils.log10Factorial(mismatches);
+ }
+
+ @Requires({"qual-minQualityScore <= maxQualityScore"})
+ public double getSiteLogErrorProbabilityGivenQual (int qual) {
+     return this.probabilityVector.getLogProbabilityForIndex(qual); // qual is passed straight through as the vector index
+ }
+
+ public byte getMaxQualityScore() {
+     return this.maxQualityScore; // highest site quality this model was built for
+ }
+
+ public byte getMinQualityScore() {
+     return this.minQualityScore; // lowest site quality this model was built for
+ }
+
+ public int getMinSignificantQualityScore() {
+     return new ProbabilityVector(this.probabilityVector, true).getMinVal(); // getMinVal() of a copy built with the compress flag on
+ }
+
+ public int getMaxSignificantQualityScore() {
+     return new ProbabilityVector(this.probabilityVector, true).getMaxVal(); // getMaxVal() of a copy built with the compress flag on
+ }
+
+ public int getReferenceDepth() {
+     return this.refDepth; // pileup elements counted by the main constructor; 0 when it had no usable reference data
+ }
+ public boolean hasData() {
+     return this.hasData; // false only when the main constructor fell back to the no-reference-data model
+ }
+
+ public ProbabilityVector getErrorModelVector() {
+     return this.probabilityVector; // the internal vector itself, not a copy
+ }
+
+ public String toString() {
+ StringBuilder result = new StringBuilder("(");
+ boolean skipComma = true;
+ for (double v : probabilityVector.getProbabilityVector()) {
+ if (skipComma) {
+ skipComma = false;
+ }
+ else {
+ result.append(",");
+ }
+ result.append(String.format("%.4f", v));
+ }
+ result.append(")");
+ return result.toString();
+ }
+
+ public static int getTotalReferenceDepth(HashMap perLaneErrorModels) {
+ int n=0;
+ for (ErrorModel e : perLaneErrorModels.values()) {
+ n += e.getReferenceDepth();
+ }
+ return n;
+ }
+
+ /*
+@Requires({"maxAlleleCount >= 0"})
+//todo -- memoize this function
+ public boolean hasPowerForMaxAC (int maxAlleleCount) {
+ int siteQ = (int) Math.ceil(MathUtils.probabilityToPhredScale((double) 1/maxAlleleCount));
+ double log10CumSum = getCumulativeSum(siteQ);
+ return log10CumSum < log10minPower;
+ } */
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java
new file mode 100644
index 000000000..530ba3ef8
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java
@@ -0,0 +1,269 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
+import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
+import org.broadinstitute.sting.utils.haplotype.Haplotype;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.variant.variantcontext.Allele;
+
+import java.util.*;
+
+/**
+ * Indel-specific subclass of GeneralPloidyGenotypeLikelihoods that accumulates pileup evidence for indel alleles.
+ * User: delangel
+ * Date: 5/18/12
+ * Time: 10:06 AM
+ * Without reference-sample data reads are scored with the pair HMM; otherwise per-allele matches are counted for the error model.
+ */
+public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotypeLikelihoods {
+ final PairHMMIndelErrorModel pairModel; // computes read/haplotype likelihoods when there is no reference sample data
+ final LinkedHashMap haplotypeMap; // handed to pairModel; key/value types are not visible here
+ final ReferenceContext refContext; // reference context handed to pairModel
+ final int eventLength; // IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles)
+ double[][] readHaplotypeLikelihoods; // set in add(); read as [row][allele index] in getLikelihoodOfConformation
+
+ final byte refBase; // referenceContext.getBase(), used when matching pileup elements to alleles
+ final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap; // passed through to pairModel
+
+ public GeneralPloidyIndelGenotypeLikelihoods(final List alleles, // first five arguments are forwarded to the superclass constructor
+ final double[] logLikelihoods,
+ final int ploidy,
+ final HashMap perLaneErrorModels,
+ final boolean ignoreLaneInformation,
+ final PairHMMIndelErrorModel pairModel,
+ final LinkedHashMap haplotypeMap,
+ final ReferenceContext referenceContext,
+ final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) {
+ super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation);
+ this.pairModel = pairModel;
+ this.haplotypeMap = haplotypeMap;
+ this.refContext = referenceContext;
+ this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles);
+ // todo - not needed if indel alleles have base at current position
+ this.refBase = referenceContext.getBase();
+ this.perReadAlleleLikelihoodMap = perReadAlleleLikelihoodMap;
+ }
+
+ // -------------------------------------------------------------------------------------
+ //
+ // add() routines. These are the workhorse routines for calculating the overall genotype
+ // likelihoods given observed bases and reads. Includes high-level operators all the
+ // way down to single base and qual functions.
+ //
+ // -------------------------------------------------------------------------------------
+
+ /**
+ * Adds the observations in the pileup to the likelihoods. With reference-sample data the pileup is
+ * processed once per lane error model (or once in total when lane information is ignored);
+ * without it, the whole pileup is added with no error model.
+ *
+ * @param pileup read pileup
+ * @param UAC unified argument collection (not read by this method)
+ * @return the sum of the counts returned by the per-lane add() calls
+ */
+ public int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC) {
+ int n = 0;
+
+ if (!hasReferenceSampleData) {
+ // no error models
+ return add(pileup, (ErrorModel)null);
+ }
+ for (String laneID : perLaneErrorModels.keySet() ) {
+ // get pileup for this lane
+ ReadBackedPileup perLanePileup;
+ if (ignoreLaneInformation)
+ perLanePileup = pileup;
+ else
+ perLanePileup = pileup.getPileupForLane(laneID);
+
+ if (perLanePileup == null || perLanePileup.isEmpty())
+ continue;
+
+ ErrorModel errorModel = perLaneErrorModels.get(laneID);
+ n += add(perLanePileup, errorModel);
+ if (ignoreLaneInformation)
+ break; // the full pileup has been added once (with this lane's error model); stop
+
+ }
+
+ return n;
+ }
+
+ /**
+ * Calculates the pool's probability for all possible allele counts for all indel alleles observed.
+ * Calculation is based on the error model
+ * generated by the reference sample on the same lane. The probability is given by :
+ *
+ * Pr(ac = j1,j2,.. | pool, errorModel) = sum_over_all_Qs ( Pr(j1,j2,.. * Pr(errorModel_q) *
+ * Pr(ac=j1,j2,..| pool, errorModel) = sum_over_all_Qs ( Pr(ac=j1,j2,..) * Pr(errorModel_q) *
+ * [j1 * (1-eq)/2n + eq/3*(2*N-j1)
+ * [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC *
+ * jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT
+ *
+ * log Pr(ac=jA,jC,jG,jT| pool, errorModel) = logsum( Pr(ac=jA,jC,jG,jT) * Pr(errorModel_q) *
+ * [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC *
+ * jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT)
+ * = logsum(logPr(ac=jA,jC,jG,jT) + log(Pr(error_Model(q)
+ * )) + nA*log(jA/2N(1-eq)+eq/3*(2N-jA)/2N) + nC*log(jC/2N(1-eq)+eq/3*(2N-jC)/2N)
+ * + log(jG/2N(1-eq)+eq/3*(2N-jG)/2N) + log(jT/2N(1-eq)+eq/3*(2N-jT)/2N)
+ *
+ * Let Q(j,k) = log(j/2N*(1-e[k]) + (2N-j)/2N*e[k]/3)
+ *
+ * Then logPr(ac=jA,jC,jG,jT|D,errorModel) = logPR(ac=Ja,jC,jG,jT) + logsum_k( logPr (errorModel[k],
+ * nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
+ *
+ * If pileup data comes from several error models (because lanes can have different error models),
+ * Pr(Ac=j|D,E1,E2) = sum(Pr(AC1=j1|D,E1,E2) * Pr(AC2=j-j2|D,E1,E2))
+ * = sum(Pr(AC1=j1|D,E1)*Pr(AC2=j-j1|D,E2)) from j=0..2N
+ *
+ * So, for each lane, build error model and combine lanes.
+ * To store model, can do
+ * for jA=0:2N
+ * for jC = 0:2N-jA
+ * for jG = 0:2N-jA-jC
+ * for jT = 0:2N-jA-jC-jG
+ * Q(jA,jC,jG,jT)
+ * for k = minSiteQual:maxSiteQual
+ * likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
+ *
+ *
+ *
+ * where: nA,nC,nG,nT = counts of bases observed in pileup.
+ * NOTE(review): the derivation above is phrased for base counts (A,C,G,T); in this method the counts are
+ * per-allele matches from ErrorModel.pileupElementMatches, or pair-HMM likelihoods without a reference sample.
+ * @param pileup Base pileup
+ * @param errorModel Site error model (null is passed when there is no reference sample data)
+ * @return Number of pileup elements counted, or readHaplotypeLikelihoods.length when there is no reference sample data
+ */
+ private int add(ReadBackedPileup pileup, ErrorModel errorModel) {
+ int n=0;
+
+ // Number of observations of each allele in the pileup, in allele order
+ List numSeenBases = new ArrayList(this.alleles.size());
+
+ if (!hasReferenceSampleData) {
+
+ readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, perReadAlleleLikelihoodMap);
+ n = readHaplotypeLikelihoods.length;
+ } else {
+ Allele refAllele = null;
+ for (Allele a:alleles) {
+ numSeenBases.add(0); // one zero-initialised counter per allele
+ if (a.isReference())
+ refAllele = a;
+ }
+
+ if (refAllele == null)
+ throw new ReviewedStingException("BUG: no ref alleles in passed in allele list!");
+
+ // count number of elements in pileup
+ for (PileupElement elt : pileup) {
+ if (VERBOSE)
+ System.out.format("base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d\n",elt.getBase(), elt.isBeforeDeletionStart(),elt.isBeforeInsertion(),elt.getBasesOfImmediatelyFollowingInsertion(),elt.getLengthOfImmediatelyFollowingIndel());
+ int idx =0;
+ for (Allele allele : alleles) {
+ int cnt = numSeenBases.get(idx);
+ numSeenBases.set(idx++,cnt + (ErrorModel.pileupElementMatches(elt, allele, refAllele, refBase)?1:0)); // +1 for each allele this element matches
+ }
+
+ n++;
+
+ }
+ }
+ computeLikelihoods(errorModel, alleles, numSeenBases, pileup); // called in both branches; numSeenBases is empty without reference sample data
+ return n;
+ }
+
+
+
+ /**
+ * Compute likelihood of current conformation and store it in ACset.getLog10Likelihoods()[0]
+ * @param ACset Allele-count set to evaluate; receives the result
+ * @param errorModel Site-specific error model object (only dereferenced when reference sample data is present)
+ * @param alleleList List of alleles
+ * @param numObservations Number of observations for each allele in alleleList
+ * @param pileup Read pileup (not read by this method)
+ */
+ public void getLikelihoodOfConformation(final ExactACset ACset,
+ final ErrorModel errorModel,
+ final List alleleList,
+ final List numObservations,
+ final ReadBackedPileup pileup) {
+ final int[] currentCnt = Arrays.copyOf(ACset.getACcounts().getCounts(), alleleList.size()); // counts truncated or zero-padded to one entry per allele
+ double p1 = 0.0;
+
+ if (!hasReferenceSampleData) {
+ // no error model: use pair HMM likelihoods
+ for (int i=0; i < readHaplotypeLikelihoods.length; i++) {
+ double acc[] = new double[alleleList.size()];
+ for (int k=0; k < acc.length; k++ )
+ acc[k] = readHaplotypeLikelihoods[i][k] + MathUtils.log10Cache[currentCnt[k]]-LOG10_PLOIDY; // presumably likelihood + log10(count/ploidy) - confirm
+ p1 += MathUtils.log10sumLog10(acc); // rows are combined by adding their log10 values
+ }
+
+ } else {
+ final int minQ = errorModel.getMinSignificantQualityScore();
+ final int maxQ = errorModel.getMaxSignificantQualityScore();
+ final double[] acVec = new double[maxQ - minQ + 1]; // one slot per quality score in [minQ, maxQ]
+
+
+ for (int k=minQ; k<=maxQ; k++) {
+ int idx=0;
+ for (int n : numObservations)
+ acVec[k-minQ] += n*logMismatchProbabilityArray[currentCnt[idx++]][k]; // table is indexed [allele count][quality k]
+ }
+ p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec); // combine with the error model's probabilities over [minQ, maxQ]
+ }
+ ACset.getLog10Likelihoods()[0] = p1;
+ }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
new file mode 100644
index 000000000..95d3fb78b
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
@@ -0,0 +1,141 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.variant.variantcontext.Allele;
+import org.broadinstitute.variant.variantcontext.VariantContext;
+
+import java.util.List;
+import java.util.Map;
+
+
+/**
+ * Abstract base for the models that calculate genotype likelihoods; subclasses implement getLikelihoods()
+ */
+public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
+
+ public static final String DUMMY_LANE = "Lane1";
+ public static final String DUMMY_SAMPLE_NAME = "DummySample1";
+
+ /* public enum Model {
+ SNP,
+ INDEL,
+ BOTH
+ }
+ */
+ public enum Model { // supersedes the commented-out three-value enum above
+ SNP,
+ INDEL,
+ GENERALPLOIDYSNP,
+ GENERALPLOIDYINDEL,
+ BOTH
+ }
+
+ public enum GENOTYPING_MODE {
+ /** the Unified Genotyper will choose the most likely alternate allele */
+ DISCOVERY,
+ /** only the alleles passed in from a VCF rod bound to the -alleles argument will be used for genotyping */
+ GENOTYPE_GIVEN_ALLELES
+ }
+
+ protected final UnifiedArgumentCollection UAC;
+ protected Logger logger;
+
+ /**
+ * Create a new object; throws ReviewedStingException("Bad arguments") if either argument is null
+ * @param UAC unified arg collection
+ * @param logger logger
+ */
+ protected GenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
+ if ( logger == null || UAC == null ) throw new ReviewedStingException("Bad arguments");
+ this.UAC = UAC;
+ this.logger = logger;
+ }
+
+ /**
+ * Must be implemented by concrete subclasses (the method is abstract)
+ * @param tracker rod data
+ * @param ref reference context
+ * @param contexts stratified alignment contexts
+ * @param contextType stratified context type
+ * @param allAllelesToUse the alleles to use, null if not set
+ * @param useBAQedPileup should we use the BAQed pileup or the raw one?
+ * @param locParser Genome Loc Parser
+ * @param perReadAlleleLikelihoodMap per-read allele likelihood map (key/value types not visible here; presumably filled in by implementations)
+ * @return variant context where genotypes are no-called but with GLs
+ */
+ public abstract VariantContext getLikelihoods(final RefMetaDataTracker tracker,
+ final ReferenceContext ref,
+ final Map contexts,
+ final AlignmentContextUtils.ReadOrientation contextType,
+ final List allAllelesToUse,
+ final boolean useBAQedPileup,
+ final GenomeLocParser locParser,
+ final Map perReadAlleleLikelihoodMap);
+
+
+ protected int getFilteredDepth(ReadBackedPileup pileup) { // returns how many pileup elements pass BaseUtils.isRegularBase
+ int count = 0;
+ for ( PileupElement p : pileup ) {
+ if ( BaseUtils.isRegularBase( p.getBase() ) )
+ count++;
+ }
+
+ return count;
+ }
+
+}
\ No newline at end of file
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
new file mode 100644
index 000000000..ae2ea427b
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
@@ -0,0 +1,262 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.haplotype.Haplotype;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.variant.variantcontext.*;
+
+import java.util.*;
+
+public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
+    /** Margin, in bases, that an event must keep from the contig start and from the end of the reference window. */
+    private static final int HAPLOTYPE_SIZE = 80;
+
+    /** Variant types accepted from the given-alleles input. */
+    private static final EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED);
+
+    private final boolean DEBUG;
+    private final boolean ignoreSNPAllelesWhenGenotypingIndels;
+    private final PairHMMIndelErrorModel pairModel;
+
+    /** Haplotypes for the site currently being processed, keyed by allele. */
+    private final LinkedHashMap<Allele, Haplotype> haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
+
+    /** Alleles for the current site, reference allele first; replaced on every COMPLETE-orientation call. */
+    private List<Allele> alleleList = new ArrayList<Allele>();
+
+    /** Builds the model, reading gap penalties, debug flag, pair-HMM choice and SNP-allele policy from {@code UAC}. */
+    protected IndelGenotypeLikelihoodsCalculationModel(final UnifiedArgumentCollection UAC, final Logger logger) {
+        super(UAC, logger);
+        DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
+        ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
+        pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, DEBUG, UAC.pairHMM);
+    }
+
+    /** Delegates consensus-allele computation to a {@link ConsensusAlleleCounter} configured from {@code UAC}. */
+    protected static List<Allele> computeConsensusAlleles(final ReferenceContext ref, final Map<String, AlignmentContext> contexts,
+                                                          final AlignmentContextUtils.ReadOrientation contextType, final UnifiedArgumentCollection UAC) {
+        final ConsensusAlleleCounter counter = new ConsensusAlleleCounter(true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE);
+        return counter.computeConsensusAlleles(ref, contexts, contextType);
+    }
+
+    /**
+     * Computes per-sample diploid genotype likelihoods for the alleles at the current locus.
+     *
+     * On a COMPLETE-orientation call the cached haplotypes, the per-read likelihood map and the allele list are
+     * reset for the new site; any other orientation reuses the allele list left by the preceding call.
+     *
+     * @param perReadAlleleLikelihoodMap sample name -> per-read, per-allele likelihoods; cleared on COMPLETE calls
+     * @return a "UG_call" variant context with PL and DP per sample, or null when there are no alleles or no
+     *         haplotypes can be built at this position
+     */
+    @Override
+    public VariantContext getLikelihoods(final RefMetaDataTracker tracker, final ReferenceContext ref,
+                                         final Map<String, AlignmentContext> contexts, final AlignmentContextUtils.ReadOrientation contextType,
+                                         final List<Allele> allAllelesToUse, final boolean useBAQedPileup,
+                                         final GenomeLocParser locParser, final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
+        final GenomeLoc loc = ref.getLocus();
+        if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) {
+            // starting a new site: drop what was cached for the previous one
+            haplotypeMap.clear();
+            perReadAlleleLikelihoodMap.clear();
+            alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, UAC, ignoreSNPAllelesWhenGenotypingIndels);
+            if (alleleList.isEmpty())
+                return null;
+        }
+
+        getHaplotypeMapFromAlleles(alleleList, ref, loc, haplotypeMap); // adds to haplotypeMap, or empties it
+        if (haplotypeMap.isEmpty())
+            return null;
+
+        // For all non-SNP VC types the end is start + length of the ref allele (padding base included) - 1.
+        final int endLoc = loc.getStart() + alleleList.get(0).length() - 1;
+        final int eventLength = getEventLength(alleleList);
+        final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleleList);
+
+        // One genotype per sample that has a pileup, carrying only likelihoods (PL) and filtered depth (DP).
+        final GenotypesContext genotypes = GenotypesContext.create();
+        for (final Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
+            final String sampleName = sample.getKey();
+            final AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
+            if (!perReadAlleleLikelihoodMap.containsKey(sampleName)) {
+                // no likelihoods have been computed for this sample at this site
+                perReadAlleleLikelihoodMap.put(sampleName, new PerReadAlleleLikelihoodMap());
+            }
+            final ReadBackedPileup pileup = context.getBasePileup();
+            if (pileup == null)
+                continue;
+
+            final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref,
+                    eventLength, perReadAlleleLikelihoodMap.get(sampleName), UAC.getSampleContamination().get(sampleName));
+            final GenotypeBuilder b = new GenotypeBuilder(sampleName);
+            b.PL(genotypeLikelihoods);
+            b.DP(getFilteredDepth(pileup));
+            genotypes.add(b.make());
+            if (DEBUG) {
+                System.out.format("Sample:%s Alleles:%s GL:", sampleName, alleleList.toString());
+                for (final double gl : genotypeLikelihoods)
+                    System.out.format("%1.4f ", gl);
+                System.out.println();
+            }
+        }
+        return builder.genotypes(genotypes).make();
+    }
+
+    /**
+     * Fills {@code haplotypeMap} with the haplotypes built for {@code alleleList}, or empties it when none can be built.
+     *
+     * The map is emptied when the locus starts within HAPLOTYPE_SIZE bases of the contig start, when the reference
+     * window stops less than HAPLOTYPE_SIZE bases past the locus, when there are no alleles, or when the event is
+     * too long for the reference window.
+     *
+     * @param alleleList   alleles to build haplotypes for, reference allele first
+     * @param ref          reference context supplying the window
+     * @param loc          locus of the event
+     * @param haplotypeMap map updated in place; entries already present are kept when haplotypes are added
+     */
+    public static void getHaplotypeMapFromAlleles(final List<Allele> alleleList, final ReferenceContext ref,
+                                                  final GenomeLoc loc, final LinkedHashMap<Allele, Haplotype> haplotypeMap) {
+        // indel too close to the edge of a contig, too little reference window (can be an issue at end of contigs), or no alleles
+        if (loc.getStart() <= HAPLOTYPE_SIZE
+                || ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE
+                || alleleList.isEmpty()) {
+            haplotypeMap.clear();
+            return;
+        }
+
+        final int eventLength = getEventLength(alleleList);
+        final int hsize = ref.getWindow().size() - Math.abs(eventLength) - 1;
+        if (hsize <= 0) { // protect against event lengths larger than ref window sizes
+            haplotypeMap.clear();
+            return;
+        }
+        final int numPrefBases = ref.getLocus().getStart() - ref.getWindow().getStart() + 1;
+        haplotypeMap.putAll(Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), ref, hsize, numPrefBases));
+    }
+
+    /**
+     * Returns the signed length difference (alternate minus reference) of the alternate allele whose length differs
+     * most from the reference allele; the first such allele wins ties.
+     *
+     * @param alleleList alleles with the reference allele first; must not be empty
+     * @return the signed event length, or 0 for a list that holds only the reference allele
+     */
+    public static int getEventLength(final List<Allele> alleleList) {
+        final int refLength = alleleList.get(0).getBaseString().length();
+        // Start from the second allele as the default, but tolerate a reference-only list instead of throwing.
+        int eventLength = alleleList.size() > 1 ? alleleList.get(1).getBaseString().length() - refLength : 0;
+        int maxLenDiff = 0;
+        for (final Allele a : alleleList) {
+            if (!a.isNonReference())
+                continue;
+            final int signedDiff = a.getBaseString().length() - refLength;
+            if (Math.abs(signedDiff) > maxLenDiff) {
+                maxLenDiff = Math.abs(signedDiff);
+                eventLength = signedDiff;
+            }
+        }
+        return eventLength;
+    }
+
+    /**
+     * Chooses the alleles to genotype at this site.
+     *
+     * In GENOTYPE_GIVEN_ALLELES mode they come from the first indel or mixed record of {@code UAC.alleles} that
+     * starts at the locus; in any other mode they are whatever {@code computeConsensusAlleles} returns.
+     *
+     * @param ignoreSNPAllelesWhenGenotypingIndels drop given alternate alleles that are as long as the reference
+     * @return the alleles to genotype; empty in given-alleles mode when no suitable record is found
+     */
+    public static List<Allele> getInitialAlleleList(final RefMetaDataTracker tracker, final ReferenceContext ref,
+                                                    final Map<String, AlignmentContext> contexts, final AlignmentContextUtils.ReadOrientation contextType,
+                                                    final UnifiedArgumentCollection UAC, final boolean ignoreSNPAllelesWhenGenotypingIndels) {
+        if (UAC.GenotypingMode != GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES)
+            return computeConsensusAlleles(ref, contexts, contextType, UAC);
+
+        final List<Allele> alleles = new ArrayList<Allele>();
+        VariantContext vc = null;
+        for (final VariantContext vc_input : tracker.getValues(UAC.alleles, ref.getLocus())) {
+            if (vc_input != null && allowableTypes.contains(vc_input.getType()) && ref.getLocus().getStart() == vc_input.getStart()) {
+                vc = vc_input;
+                break;
+            }
+        }
+        if (vc == null) // ignore places where we don't have a variant
+            return alleles;
+
+        for (final Allele a : vc.getAlleles()) {
+            // when asked to, leave out alternates as long as the reference (i.e. SNPs or MNPs): they are not genotyped
+            final boolean sameLengthAlt = a.isNonReference() && a.getBases().length == vc.getReference().getBases().length;
+            if (!ignoreSNPAllelesWhenGenotypingIndels || !sameLengthAlt)
+                alleles.add(a);
+        }
+        return alleles;
+    }
+
+    /** Overrides the base-model depth so that deletions count as part of the pileup and per-sample DP includes them. */
+    @Override
+    protected int getFilteredDepth(ReadBackedPileup pileup) {
+        int count = 0;
+        for (final PileupElement p : pileup) {
+            if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase()))
+                count++;
+        }
+        return count;
+    }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ProbabilityVector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ProbabilityVector.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ProbabilityVector.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ProbabilityVector.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
similarity index 100%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
rename to protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
new file mode 100644
index 000000000..c5070a76f
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
@@ -0,0 +1,844 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import com.google.java.contract.Ensures;
+import com.google.java.contract.Requires;
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.commandline.RodBinding;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
+import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalc;
+import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
+import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult;
+import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.sting.utils.baq.BAQ;
+import org.broadinstitute.sting.utils.classloader.PluginManager;
+import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.variant.vcf.VCFConstants;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.variant.variantcontext.*;
+
+import java.io.PrintStream;
+import java.lang.reflect.Constructor;
+import java.util.*;
+
+public class UnifiedGenotyperEngine {
+ public static final String LOW_QUAL_FILTER_NAME = "LowQual";
+ private static final String GPSTRING = "GENERALPLOIDY";
+
+ public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA";
+ public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL";
+
+ public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3;
+ public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4;
+
+ private static final int SNP_MODEL = 0;
+ private static final int INDEL_MODEL = 1;
+
+ /** Selects the sites at which calls are produced. */
+ public enum OUTPUT_MODE {
+ /** Produces calls only at variant sites. */
+ EMIT_VARIANTS_ONLY,
+ /** Produces calls at variant sites and at confident reference sites. */
+ EMIT_ALL_CONFIDENT_SITES,
+ /** Produces calls at any callable site regardless of confidence. Intended only for point mutations (SNPs) in DISCOVERY mode,
+ * or generally for GENOTYPE_GIVEN_ALLELES mode; it will by no means produce a comprehensive set of indels in DISCOVERY mode. */
+ EMIT_ALL_SITES
+ }
+
+ // the unified argument collection; the field is final, so the reference is assigned only once
+ private final UnifiedArgumentCollection UAC;
+ public UnifiedArgumentCollection getUAC() { return UAC; } // returns the collection itself, not a copy
+
+ // the annotation engine
+ private final VariantAnnotatorEngine annotationEngine;
+
+ // the model used for calculating genotypes
+ private ThreadLocal